xref: /freebsd/sys/dev/sfxge/sfxge_rx.c (revision b00fe64f4acfe315181f65999af16e9a7bdc600b)
1 /*-
2  * Copyright (c) 2010-2011 Solarflare Communications, Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/types.h>
34 #include <sys/mbuf.h>
35 #include <sys/smp.h>
36 #include <sys/socket.h>
37 #include <sys/sysctl.h>
38 #include <sys/limits.h>
39 #include <sys/syslog.h>
40 
41 #include <net/ethernet.h>
42 #include <net/if.h>
43 #include <net/if_vlan_var.h>
44 
45 #include <netinet/in.h>
46 #include <netinet/ip.h>
47 #include <netinet/ip6.h>
48 #include <netinet/tcp.h>
49 
50 #include <machine/in_cksum.h>
51 
52 #include "common/efx.h"
53 
54 
55 #include "sfxge.h"
56 #include "sfxge_rx.h"
57 
58 #define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
59 
60 #ifdef SFXGE_LRO
61 
62 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
63 	    "Large receive offload (LRO) parameters");
64 
65 #define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
66 
67 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
68  * means we can accelerate a larger number of streams.
69  */
70 static unsigned lro_table_size = 128;
71 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
72 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
73 	    &lro_table_size, 0,
74 	    "Size of the LRO hash table (must be a power of 2)");
75 
76 /* Maximum length of a hash chain.  If chains get too long then the lookup
77  * time increases and may exceed the benefit of LRO.
78  */
79 static unsigned lro_chain_max = 20;
80 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
81 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
82 	    &lro_chain_max, 0,
83 	    "The maximum length of a hash chain");
84 
85 /* Maximum time (in ticks) that a connection can be idle before its LRO
86  * state is discarded.
87  */
88 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
89 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
90 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
91 	    &lro_idle_ticks, 0,
92 	    "The maximum time (in ticks) that a connection can be idle "
93 	    "before its LRO state is discarded");
94 
95 /* Number of packets with payload that must arrive in-order before a
96  * connection is eligible for LRO.  The idea is we should avoid coalescing
97  * segments when the sender is in slow-start because reducing the ACK rate
98  * can damage performance.
99  */
100 static unsigned lro_slow_start_packets = 2000;
101 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
102 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
103 	    &lro_slow_start_packets, 0,
104 	    "Number of packets with payload that must arrive in-order before "
105 	    "a connection is eligible for LRO");
106 
107 /* Number of packets with payload that must arrive in-order following loss
108  * before a connection is eligible for LRO.  The idea is we should avoid
109  * coalescing segments when the sender is recovering from loss, because
110  * reducing the ACK rate can damage performance.
111  */
112 static unsigned lro_loss_packets = 20;
113 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
114 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
115 	    &lro_loss_packets, 0,
116 	    "Number of packets with payload that must arrive in-order "
117 	    "following loss before a connection is eligible for LRO");
118 
119 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
120 #define	SFXGE_LRO_L2_ID_VLAN 0x4000
121 #define	SFXGE_LRO_L2_ID_IPV6 0x8000
122 #define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
123 #define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
124 
125 /* Compare IPv6 addresses, avoiding conditional branches */
126 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
127 				   const struct in6_addr *right)
128 {
129 #if LONG_BIT == 64
130 	const uint64_t *left64 = (const uint64_t *)left;
131 	const uint64_t *right64 = (const uint64_t *)right;
132 	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
133 #else
134 	return (left->s6_addr32[0] - right->s6_addr32[0]) |
135 	       (left->s6_addr32[1] - right->s6_addr32[1]) |
136 	       (left->s6_addr32[2] - right->s6_addr32[2]) |
137 	       (left->s6_addr32[3] - right->s6_addr32[3]);
138 #endif
139 }
140 
141 #endif	/* SFXGE_LRO */
142 
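/*
 * Flush-state notifications from event processing: record whether the
 * RX queue flush completed or failed.  sfxge_rx_qstop() polls
 * rxq->flush_state and retries the flush on failure.
 */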
143 void
144 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
145 {
146 
147 	rxq->flush_state = SFXGE_FLUSH_DONE;
148 }
149 
150 void
151 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
152 {
153 
154 	rxq->flush_state = SFXGE_FLUSH_FAILED;
155 }
156 
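/*
 * Toeplitz hash key used for receive-side scaling (RSS); programmed
 * into the controller by sfxge_rx_start().
 */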
157 static uint8_t toep_key[] = {
158 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
159 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
160 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
161 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
162 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
163 };
164 
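/*
 * Callout handler used to retry a failed refill: post a software
 * "refill" event to the queue's event queue so that the refill is
 * performed in the event-processing context.
 */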
165 static void
166 sfxge_rx_post_refill(void *arg)
167 {
168 	struct sfxge_rxq *rxq = arg;
169 	struct sfxge_softc *sc;
170 	unsigned int index;
171 	struct sfxge_evq *evq;
172 	uint16_t magic;
173 
174 	sc = rxq->sc;
175 	index = rxq->index;
176 	evq = sc->evq[index];
177 
178 	magic = SFXGE_MAGIC_RX_QREFILL | index;
179 
180 	/* This is guaranteed due to the start/stop order of rx and ev */
181 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
182 	    ("evq not started"));
183 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
184 	    ("rxq not started"));
185 	efx_ev_qpost(evq->common, magic);
186 }
187 
188 static void
189 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
190 {
191 	/* Initially retry after 100 ms, but back off in case of
192 	 * repeated failures as we probably have to wait for the
193 	 * administrator to raise the pool limit. */
194 	if (retrying)
195 		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
196 	else
197 		rxq->refill_delay = hz / 10;
198 
199 	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
200 			     sfxge_rx_post_refill, rxq);
201 }
202 
203 static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
204 {
205 	struct mb_args args;
206 	struct mbuf *m;
207 
208 	/* Allocate mbuf structure */
209 	args.flags = M_PKTHDR;
210 	args.type = MT_DATA;
211 	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);
212 
213 	/* Allocate (and attach) packet buffer */
214 	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
215 		uma_zfree(zone_mbuf, m);
216 		m = NULL;
217 	}
218 
219 	return (m);
220 }
221 
222 #define	SFXGE_REFILL_BATCH  64
223 
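/*
 * Top up the receive queue towards "target" outstanding descriptors,
 * allocating mbufs and posting their DMA addresses to the hardware in
 * batches of SFXGE_REFILL_BATCH.  If mbuf allocation fails, schedule a
 * retry via the refill callout.
 */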
224 static void
225 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
226 {
227 	struct sfxge_softc *sc;
228 	unsigned int index;
229 	struct sfxge_evq *evq;
230 	unsigned int batch;
231 	unsigned int rxfill;
232 	unsigned int mblksize;
233 	int ntodo;
234 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
235 
236 	sc = rxq->sc;
237 	index = rxq->index;
238 	evq = sc->evq[index];
239 
240 	prefetch_read_many(sc->enp);
241 	prefetch_read_many(rxq->common);
242 
243 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
244 
245 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
246 		return;
247 
248 	rxfill = rxq->added - rxq->completed;
249 	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
250 	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
251 	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
252 	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
253 	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
254 
255 	if (ntodo == 0)
256 		return;
257 
258 	batch = 0;
259 	mblksize = sc->rx_buffer_size;
260 	while (ntodo-- > 0) {
261 		unsigned int id;
262 		struct sfxge_rx_sw_desc *rx_desc;
263 		bus_dma_segment_t seg;
264 		struct mbuf *m;
265 
266 		id = (rxq->added + batch) & rxq->ptr_mask;
267 		rx_desc = &rxq->queue[id];
268 		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
269 
270 		rx_desc->flags = EFX_DISCARD;
271 		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
272 		if (m == NULL)
273 			break;
274 		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
275 		addr[batch++] = seg.ds_addr;
276 
277 		if (batch == SFXGE_REFILL_BATCH) {
278 			efx_rx_qpost(rxq->common, addr, mblksize, batch,
279 			    rxq->completed, rxq->added);
280 			rxq->added += batch;
281 			batch = 0;
282 		}
283 	}
284 
285 	if (ntodo != 0)
286 		sfxge_rx_schedule_refill(rxq, retrying);
287 
288 	if (batch != 0) {
289 		efx_rx_qpost(rxq->common, addr, mblksize, batch,
290 		    rxq->completed, rxq->added);
291 		rxq->added += batch;
292 	}
293 
294 	/* Make the descriptors visible to the hardware */
295 	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
296 			BUS_DMASYNC_PREWRITE);
297 
298 	efx_rx_qpush(rxq->common, rxq->added);
299 }
300 
301 void
302 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
303 {
304 
305 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
306 		return;
307 
308 	/* Make sure the queue is full */
309 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
310 }
311 
312 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
313 {
314 	struct ifnet *ifp = sc->ifnet;
315 
316 	m->m_pkthdr.rcvif = ifp;
317 	m->m_pkthdr.csum_data = 0xffff;
318 	ifp->if_input(ifp, m);
319 }
320 
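/*
 * Hand a received packet to the stack: translate the hardware checksum
 * and hash information into mbuf metadata, strip the packet prefix and
 * pass the mbuf to __sfxge_rx_deliver() / if_input().
 */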
321 static void
322 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
323 {
324 	struct mbuf *m = rx_desc->mbuf;
325 	int flags = rx_desc->flags;
326 	int csum_flags;
327 
328 	/* Convert checksum flags */
329 	csum_flags = (flags & EFX_CKSUM_IPV4) ?
330 		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
331 	if (flags & EFX_CKSUM_TCPUDP)
332 		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
333 
334 	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
335 		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
336 						       mtod(m, uint8_t *));
337 		/* The hash covers a 4-tuple for TCP only */
338 		M_HASHTYPE_SET(m,
339 		    (flags & EFX_PKT_IPV4) ?
340 			((flags & EFX_PKT_TCP) ?
341 			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
342 			((flags & EFX_PKT_TCP) ?
343 			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
344 	}
345 	m->m_data += sc->rx_prefix_size;
346 	m->m_len = rx_desc->size - sc->rx_prefix_size;
347 	m->m_pkthdr.len = m->m_len;
348 	m->m_pkthdr.csum_flags = csum_flags;
349 	__sfxge_rx_deliver(sc, rx_desc->mbuf);
350 
351 	rx_desc->flags = EFX_DISCARD;
352 	rx_desc->mbuf = NULL;
353 }
354 
355 #ifdef SFXGE_LRO
356 
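/*
 * Deliver a coalesced packet: restore network byte order, recompute the
 * IPv4 header checksum, copy the latest TCP window/ACK (and options)
 * into the merged header, then pass the chain to the stack.
 */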
357 static void
358 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
359 {
360 	struct sfxge_softc *sc = st->sc;
361 	struct mbuf *m = c->mbuf;
362 	struct tcphdr *c_th;
363 	int csum_flags;
364 
365 	KASSERT(m, ("no mbuf to deliver"));
366 
367 	++st->n_bursts;
368 
369 	/* Finish off packet munging and recalculate IP header checksum. */
370 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
371 		struct ip *iph = c->nh;
372 		iph->ip_len = htons(iph->ip_len);
373 		iph->ip_sum = 0;
374 		iph->ip_sum = in_cksum_hdr(iph);
375 		c_th = (struct tcphdr *)(iph + 1);
376 		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
377 			      CSUM_IP_CHECKED | CSUM_IP_VALID);
378 	} else {
379 		struct ip6_hdr *iph = c->nh;
380 		iph->ip6_plen = htons(iph->ip6_plen);
381 		c_th = (struct tcphdr *)(iph + 1);
382 		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
383 	}
384 
385 	c_th->th_win = c->th_last->th_win;
386 	c_th->th_ack = c->th_last->th_ack;
387 	if (c_th->th_off == c->th_last->th_off) {
388 		/* Copy TCP options (take care to avoid going negative). */
389 		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
390 		memcpy(c_th + 1, c->th_last + 1, optlen);
391 	}
392 
393 	m->m_pkthdr.flowid = c->conn_hash;
394 	M_HASHTYPE_SET(m,
395 	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
396 		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
397 
398 	m->m_pkthdr.csum_flags = csum_flags;
399 	__sfxge_rx_deliver(sc, m);
400 
401 	c->mbuf = NULL;
402 	c->delivered = 1;
403 }
404 
405 /* Drop the given connection, and add it to the free list. */
406 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
407 {
408 	unsigned bucket;
409 
410 	KASSERT(!c->mbuf, ("found orphaned mbuf"));
411 
412 	if (c->next_buf.mbuf != NULL) {
413 		sfxge_rx_deliver(rxq->sc, &c->next_buf);
414 		LIST_REMOVE(c, active_link);
415 	}
416 
417 	bucket = c->conn_hash & rxq->lro.conns_mask;
418 	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
419 	--rxq->lro.conns_n[bucket];
420 	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
421 	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
422 }
423 
424 /* Stop tracking connections that have gone idle in order to keep hash
425  * chains short.
426  */
427 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
428 {
429 	struct sfxge_lro_conn *c;
430 	unsigned i;
431 
432 	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
433 		("found active connections"));
434 
435 	rxq->lro.last_purge_ticks = now;
436 	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
437 		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
438 			continue;
439 
440 		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
441 		if (now - c->last_pkt_ticks > lro_idle_ticks) {
442 			++rxq->lro.n_drop_idle;
443 			sfxge_lro_drop(rxq, c);
444 		}
445 	}
446 }
447 
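/*
 * Append a new in-order segment to an existing coalesced packet: chain
 * the mbuf, grow the IP payload length and carry over the PUSH flag.
 * The packet is delivered early if a further segment could overflow the
 * 16-bit IP length.
 */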
448 static void
449 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
450 		struct mbuf *mbuf, struct tcphdr *th)
451 {
452 	struct tcphdr *c_th;
453 
454 	/* Tack the new mbuf onto the chain. */
455 	KASSERT(!mbuf->m_next, ("mbuf already chained"));
456 	c->mbuf_tail->m_next = mbuf;
457 	c->mbuf_tail = mbuf;
458 
459 	/* Increase length appropriately */
460 	c->mbuf->m_pkthdr.len += mbuf->m_len;
461 
462 	/* Update the connection state flags */
463 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
464 		struct ip *iph = c->nh;
465 		iph->ip_len += mbuf->m_len;
466 		c_th = (struct tcphdr *)(iph + 1);
467 	} else {
468 		struct ip6_hdr *iph = c->nh;
469 		iph->ip6_plen += mbuf->m_len;
470 		c_th = (struct tcphdr *)(iph + 1);
471 	}
472 	c_th->th_flags |= (th->th_flags & TH_PUSH);
473 	c->th_last = th;
474 	++st->n_merges;
475 
476 	/* Pass packet up now if another segment could overflow the IP
477 	 * length.
478 	 */
479 	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
480 		sfxge_lro_deliver(st, c);
481 }
482 
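/*
 * Begin coalescing with this segment as the head of a new chain,
 * converting the IP length field to host order so that later merges can
 * update it cheaply.
 */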
483 static void
484 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
485 		struct mbuf *mbuf, void *nh, struct tcphdr *th)
486 {
487 	/* Start the chain */
488 	c->mbuf = mbuf;
489 	c->mbuf_tail = c->mbuf;
490 	c->nh = nh;
491 	c->th_last = th;
492 
493 	mbuf->m_pkthdr.len = mbuf->m_len;
494 
495 	/* Mangle header fields for later processing */
496 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
497 		struct ip *iph = nh;
498 		iph->ip_len = ntohs(iph->ip_len);
499 	} else {
500 		struct ip6_hdr *iph = nh;
501 		iph->ip6_plen = ntohs(iph->ip6_plen);
502 	}
503 }
504 
505 /* Try to merge or otherwise hold or deliver (as appropriate) the
506  * packet buffered for this connection (c->next_buf).  Return a flag
507  * indicating whether the connection is still active for LRO purposes.
508  */
509 static int
510 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
511 {
512 	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
513 	char *eh = c->next_eh;
514 	int data_length, hdr_length, dont_merge;
515 	unsigned th_seq, pkt_length;
516 	struct tcphdr *th;
517 	unsigned now;
518 
519 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
520 		struct ip *iph = c->next_nh;
521 		th = (struct tcphdr *)(iph + 1);
522 		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
523 	} else {
524 		struct ip6_hdr *iph = c->next_nh;
525 		th = (struct tcphdr *)(iph + 1);
526 		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
527 	}
528 
529 	hdr_length = (char *) th + th->th_off * 4 - eh;
530 	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
531 		       hdr_length);
532 	th_seq = ntohl(th->th_seq);
533 	dont_merge = ((data_length <= 0)
534 		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
535 
536 	/* Check for options other than aligned timestamp. */
537 	if (th->th_off != 5) {
538 		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
539 		if (th->th_off == 8 &&
540 		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
541 					(TCPOPT_NOP << 16) |
542 					(TCPOPT_TIMESTAMP << 8) |
543 					TCPOLEN_TIMESTAMP)) {
544 			/* timestamp option -- okay */
545 		} else {
546 			dont_merge = 1;
547 		}
548 	}
549 
550 	if (__predict_false(th_seq != c->next_seq)) {
551 		/* Out-of-order, so start counting again. */
552 		if (c->mbuf != NULL)
553 			sfxge_lro_deliver(&rxq->lro, c);
554 		c->n_in_order_pkts -= lro_loss_packets;
555 		c->next_seq = th_seq + data_length;
556 		++rxq->lro.n_misorder;
557 		goto deliver_buf_out;
558 	}
559 	c->next_seq = th_seq + data_length;
560 
561 	now = ticks;
562 	if (now - c->last_pkt_ticks > lro_idle_ticks) {
563 		++rxq->lro.n_drop_idle;
564 		if (c->mbuf != NULL)
565 			sfxge_lro_deliver(&rxq->lro, c);
566 		sfxge_lro_drop(rxq, c);
567 		return (0);
568 	}
569 	c->last_pkt_ticks = ticks;
570 
571 	if (c->n_in_order_pkts < lro_slow_start_packets) {
572 		/* May be in slow-start, so don't merge. */
573 		++rxq->lro.n_slow_start;
574 		++c->n_in_order_pkts;
575 		goto deliver_buf_out;
576 	}
577 
578 	if (__predict_false(dont_merge)) {
579 		if (c->mbuf != NULL)
580 			sfxge_lro_deliver(&rxq->lro, c);
581 		if (th->th_flags & (TH_FIN | TH_RST)) {
582 			++rxq->lro.n_drop_closed;
583 			sfxge_lro_drop(rxq, c);
584 			return (0);
585 		}
586 		goto deliver_buf_out;
587 	}
588 
589 	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
590 
591 	if (__predict_true(c->mbuf != NULL)) {
592 		/* Remove headers and any padding */
593 		rx_buf->mbuf->m_data += hdr_length;
594 		rx_buf->mbuf->m_len = data_length;
595 
596 		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
597 	} else {
598 		/* Remove any padding */
599 		rx_buf->mbuf->m_len = pkt_length;
600 
601 		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
602 	}
603 
604 	rx_buf->mbuf = NULL;
605 	return (1);
606 
607  deliver_buf_out:
608 	sfxge_rx_deliver(rxq->sc, rx_buf);
609 	return (1);
610 }
611 
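/*
 * Start tracking a new connection, recycling an entry from the free
 * list if possible, and insert it at the head of its hash bucket.
 * Tracking is skipped if the bucket's chain is already at lro_chain_max.
 */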
612 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
613 			       uint16_t l2_id, void *nh, struct tcphdr *th)
614 {
615 	unsigned bucket = conn_hash & st->conns_mask;
616 	struct sfxge_lro_conn *c;
617 
618 	if (st->conns_n[bucket] >= lro_chain_max) {
619 		++st->n_too_many;
620 		return;
621 	}
622 
623 	if (!TAILQ_EMPTY(&st->free_conns)) {
624 		c = TAILQ_FIRST(&st->free_conns);
625 		TAILQ_REMOVE(&st->free_conns, c, link);
626 	} else {
627 		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
628 		if (c == NULL)
629 			return;
630 		c->mbuf = NULL;
631 		c->next_buf.mbuf = NULL;
632 	}
633 
634 	/* Create the connection tracking data */
635 	++st->conns_n[bucket];
636 	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
637 	c->l2_id = l2_id;
638 	c->conn_hash = conn_hash;
639 	c->source = th->th_sport;
640 	c->dest = th->th_dport;
641 	c->n_in_order_pkts = 0;
642 	c->last_pkt_ticks = *(volatile int *)&ticks;
643 	c->delivered = 0;
644 	++st->n_new_stream;
645 	/* NB. We don't initialise c->next_seq, and it doesn't matter what
646 	 * value it has.  Most likely the next packet received for this
647 	 * connection will not match -- no harm done.
648 	 */
649 }
650 
651 /* Process mbuf and decide whether to dispatch it to the stack now or
652  * later.
653  */
654 static void
655 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
656 {
657 	struct sfxge_softc *sc = rxq->sc;
658 	struct mbuf *m = rx_buf->mbuf;
659 	struct ether_header *eh;
660 	struct sfxge_lro_conn *c;
661 	uint16_t l2_id;
662 	uint16_t l3_proto;
663 	void *nh;
664 	struct tcphdr *th;
665 	uint32_t conn_hash;
666 	unsigned bucket;
667 
668 	/* Get the hardware hash */
669 	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
670 				      mtod(m, uint8_t *));
671 
672 	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
673 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
674 		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
675 		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
676 			SFXGE_LRO_L2_ID_VLAN;
677 		l3_proto = veh->evl_proto;
678 		nh = veh + 1;
679 	} else {
680 		l2_id = 0;
681 		l3_proto = eh->ether_type;
682 		nh = eh + 1;
683 	}
684 
685 	/* Check whether this is a suitable packet (unfragmented
686 	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
687 	 * length, and compute a hash if necessary.  If not, return.
688 	 */
689 	if (l3_proto == htons(ETHERTYPE_IP)) {
690 		struct ip *iph = nh;
691 		if ((iph->ip_p - IPPROTO_TCP) |
692 		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
693 		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
694 			goto deliver_now;
695 		th = (struct tcphdr *)(iph + 1);
696 	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
697 		struct ip6_hdr *iph = nh;
698 		if (iph->ip6_nxt != IPPROTO_TCP)
699 			goto deliver_now;
700 		l2_id |= SFXGE_LRO_L2_ID_IPV6;
701 		th = (struct tcphdr *)(iph + 1);
702 	} else {
703 		goto deliver_now;
704 	}
705 
706 	bucket = conn_hash & rxq->lro.conns_mask;
707 
708 	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
709 		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
710 			continue;
711 		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
712 			continue;
713 		if (c->mbuf != NULL) {
714 			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
715 				struct ip *c_iph, *iph = nh;
716 				c_iph = c->nh;
717 				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
718 				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
719 					continue;
720 			} else {
721 				struct ip6_hdr *c_iph, *iph = nh;
722 				c_iph = c->nh;
723 				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
724 				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
725 					continue;
726 			}
727 		}
728 
729 		/* Re-insert at head of list to reduce lookup time. */
730 		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
731 		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
732 
733 		if (c->next_buf.mbuf != NULL) {
734 			if (!sfxge_lro_try_merge(rxq, c))
735 				goto deliver_now;
736 		} else {
737 			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
738 			    active_link);
739 		}
740 		c->next_buf = *rx_buf;
741 		c->next_eh = eh;
742 		c->next_nh = nh;
743 
744 		rx_buf->mbuf = NULL;
745 		rx_buf->flags = EFX_DISCARD;
746 		return;
747 	}
748 
749 	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
750  deliver_now:
751 	sfxge_rx_deliver(sc, rx_buf);
752 }
753 
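/*
 * Called at the end of an event-queue poll: try to merge the held
 * buffer for each connection that received data in this burst,
 * delivering anything that cannot be held, and periodically purge
 * connections that have been idle for too long.
 */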
754 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
755 {
756 	struct sfxge_lro_state *st = &rxq->lro;
757 	struct sfxge_lro_conn *c;
758 	unsigned t;
759 
760 	while (!LIST_EMPTY(&st->active_conns)) {
761 		c = LIST_FIRST(&st->active_conns);
762 		if (!c->delivered && c->mbuf != NULL)
763 			sfxge_lro_deliver(st, c);
764 		if (sfxge_lro_try_merge(rxq, c)) {
765 			if (c->mbuf != NULL)
766 				sfxge_lro_deliver(st, c);
767 			LIST_REMOVE(c, active_link);
768 		}
769 		c->delivered = 0;
770 	}
771 
772 	t = *(volatile int *)&ticks;
773 	if (__predict_false(t != st->last_purge_ticks))
774 		sfxge_lro_purge_idle(rxq, t);
775 }
776 
777 #else	/* !SFXGE_LRO */
778 
779 static void
780 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
781 {
782 }
783 
784 static void
785 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
786 {
787 }
788 
789 #endif	/* SFXGE_LRO */
790 
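/*
 * Process descriptors completed by the hardware: drop packets flagged
 * for discard and internal loopback packets, hand the rest to LRO or
 * directly to the stack, then top up the queue if it has drained below
 * the refill threshold.
 */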
791 void
792 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
793 {
794 	struct sfxge_softc *sc = rxq->sc;
795 	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
796 	unsigned int index;
797 	struct sfxge_evq *evq;
798 	unsigned int completed;
799 	unsigned int level;
800 	struct mbuf *m;
801 	struct sfxge_rx_sw_desc *prev = NULL;
802 
803 	index = rxq->index;
804 	evq = sc->evq[index];
805 
806 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
807 
808 	completed = rxq->completed;
809 	while (completed != rxq->pending) {
810 		unsigned int id;
811 		struct sfxge_rx_sw_desc *rx_desc;
812 
813 		id = completed++ & rxq->ptr_mask;
814 		rx_desc = &rxq->queue[id];
815 		m = rx_desc->mbuf;
816 
817 		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
818 			goto discard;
819 
820 		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
821 			goto discard;
822 
823 		prefetch_read_many(mtod(m, caddr_t));
824 
825 		/* Check for loopback packets */
826 		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
827 		    !(rx_desc->flags & EFX_PKT_IPV6)) {
828 			struct ether_header *etherhp;
829 
830 			/*LINTED*/
831 			etherhp = mtod(m, struct ether_header *);
832 
833 			if (etherhp->ether_type ==
834 			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
835 				EFSYS_PROBE(loopback);
836 
837 				rxq->loopback++;
838 				goto discard;
839 			}
840 		}
841 
842 		/* Pass packet up the stack or into LRO (pipelined) */
843 		if (prev != NULL) {
844 			if (lro_enabled)
845 				sfxge_lro(rxq, prev);
846 			else
847 				sfxge_rx_deliver(sc, prev);
848 		}
849 		prev = rx_desc;
850 		continue;
851 
852 discard:
853 		/* Return the packet to the pool */
854 		m_free(m);
855 		rx_desc->mbuf = NULL;
856 	}
857 	rxq->completed = completed;
858 
859 	level = rxq->added - rxq->completed;
860 
861 	/* Pass last packet up the stack or into LRO */
862 	if (prev != NULL) {
863 		if (lro_enabled)
864 			sfxge_lro(rxq, prev);
865 		else
866 			sfxge_rx_deliver(sc, prev);
867 	}
868 
869 	/*
870 	 * If there are any pending flows and this is the end of the
871 	 * poll then they must be completed.
872 	 */
873 	if (eop)
874 		sfxge_lro_end_of_burst(rxq);
875 
876 	/* Top up the queue if necessary */
877 	if (level < rxq->refill_threshold)
878 		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
879 }
880 
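/*
 * Stop a receive queue: flush it (retrying if the flush fails),
 * complete any outstanding descriptors, and destroy the common-code
 * queue and its buffer-table entries.
 */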
881 static void
882 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
883 {
884 	struct sfxge_rxq *rxq;
885 	struct sfxge_evq *evq;
886 	unsigned int count;
887 
888 	rxq = sc->rxq[index];
889 	evq = sc->evq[index];
890 
891 	SFXGE_EVQ_LOCK(evq);
892 
893 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
894 	    ("rxq not started"));
895 
896 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
897 
898 	callout_stop(&rxq->refill_callout);
899 
900 again:
901 	rxq->flush_state = SFXGE_FLUSH_PENDING;
902 
903 	/* Flush the receive queue */
904 	efx_rx_qflush(rxq->common);
905 
906 	SFXGE_EVQ_UNLOCK(evq);
907 
908 	count = 0;
909 	do {
910 		/* Spin for 100 ms */
911 		DELAY(100000);
912 
913 		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
914 			break;
915 
916 	} while (++count < 20);
917 
918 	SFXGE_EVQ_LOCK(evq);
919 
920 	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
921 		goto again;
922 
923 	rxq->flush_state = SFXGE_FLUSH_DONE;
924 
925 	rxq->pending = rxq->added;
926 	sfxge_rx_qcomplete(rxq, B_TRUE);
927 
928 	KASSERT(rxq->completed == rxq->pending,
929 	    ("rxq->completed != rxq->pending"));
930 
931 	rxq->added = 0;
932 	rxq->pending = 0;
933 	rxq->completed = 0;
934 	rxq->loopback = 0;
935 
936 	/* Destroy the common code receive queue. */
937 	efx_rx_qdestroy(rxq->common);
938 
939 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
940 	    EFX_RXQ_NBUFS(sc->rxq_entries));
941 
942 	SFXGE_EVQ_UNLOCK(evq);
943 }
944 
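/*
 * Start a receive queue: program the buffer table, create and enable
 * the common-code queue, and fill it with receive buffers.
 */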
945 static int
946 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
947 {
948 	struct sfxge_rxq *rxq;
949 	efsys_mem_t *esmp;
950 	struct sfxge_evq *evq;
951 	int rc;
952 
953 	rxq = sc->rxq[index];
954 	esmp = &rxq->mem;
955 	evq = sc->evq[index];
956 
957 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
958 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
959 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
960 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
961 
962 	/* Program the buffer table. */
963 	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
964 	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
965 		return (rc);
966 
967 	/* Create the common code receive queue. */
968 	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
969 	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
970 	    &rxq->common)) != 0)
971 		goto fail;
972 
973 	SFXGE_EVQ_LOCK(evq);
974 
975 	/* Enable the receive queue. */
976 	efx_rx_qenable(rxq->common);
977 
978 	rxq->init_state = SFXGE_RXQ_STARTED;
979 
980 	/* Try to fill the queue from the pool. */
981 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
982 
983 	SFXGE_EVQ_UNLOCK(evq);
984 
985 	return (0);
986 
987 fail:
988 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
989 	    EFX_RXQ_NBUFS(sc->rxq_entries));
990 	return (rc);
991 }
992 
993 void
994 sfxge_rx_stop(struct sfxge_softc *sc)
995 {
996 	int index;
997 
998 	/* Stop the receive queue(s) */
999 	index = sc->rxq_count;
1000 	while (--index >= 0)
1001 		sfxge_rx_qstop(sc, index);
1002 
1003 	sc->rx_prefix_size = 0;
1004 	sc->rx_buffer_size = 0;
1005 
1006 	efx_rx_fini(sc->enp);
1007 }
1008 
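/*
 * Bring up the receive path: initialize the common-code RX module,
 * choose the packet buffer size and UMA zone, program RSS (indirection
 * table, hash types and Toeplitz key), then start every queue.
 */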
1009 int
1010 sfxge_rx_start(struct sfxge_softc *sc)
1011 {
1012 	struct sfxge_intr *intr;
1013 	int index;
1014 	int rc;
1015 
1016 	intr = &sc->intr;
1017 
1018 	/* Initialize the common code receive module. */
1019 	if ((rc = efx_rx_init(sc->enp)) != 0)
1020 		return (rc);
1021 
1022 	/* Calculate the receive packet buffer size. */
1023 	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
1024 	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
1025 			      sc->rx_prefix_size);
1026 
1027 	/* Select zone for packet buffers */
1028 	if (sc->rx_buffer_size <= MCLBYTES)
1029 		sc->rx_buffer_zone = zone_clust;
1030 	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
1031 		sc->rx_buffer_zone = zone_jumbop;
1032 	else if (sc->rx_buffer_size <= MJUM9BYTES)
1033 		sc->rx_buffer_zone = zone_jumbo9;
1034 	else
1035 		sc->rx_buffer_zone = zone_jumbo16;
1036 
1037 	/*
1038 	 * Set up the scale table.  Enable all hash types and hash insertion.
1039 	 */
1040 	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1041 		sc->rx_indir_table[index] = index % sc->rxq_count;
1042 	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1043 				       SFXGE_RX_SCALE_MAX)) != 0)
1044 		goto fail;
1045 	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1046 	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1047 	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1048 
1049 	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
1050 	    sizeof(toep_key))) != 0)
1051 		goto fail;
1052 
1053 	/* Start the receive queue(s). */
1054 	for (index = 0; index < sc->rxq_count; index++) {
1055 		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1056 			goto fail2;
1057 	}
1058 
1059 	return (0);
1060 
1061 fail2:
1062 	while (--index >= 0)
1063 		sfxge_rx_qstop(sc, index);
1064 
1065 fail:
1066 	efx_rx_fini(sc->enp);
1067 
1068 	return (rc);
1069 }
1070 
1071 #ifdef SFXGE_LRO
1072 
1073 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1074 {
1075 	struct sfxge_lro_state *st = &rxq->lro;
1076 	unsigned i;
1077 
1078 	st->conns_mask = lro_table_size - 1;
1079 	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1080 		("lro_table_size must be a power of 2"));
1081 	st->sc = rxq->sc;
1082 	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1083 			   M_SFXGE, M_WAITOK);
1084 	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1085 			     M_SFXGE, M_WAITOK);
1086 	for (i = 0; i <= st->conns_mask; ++i) {
1087 		TAILQ_INIT(&st->conns[i]);
1088 		st->conns_n[i] = 0;
1089 	}
1090 	LIST_INIT(&st->active_conns);
1091 	TAILQ_INIT(&st->free_conns);
1092 }
1093 
1094 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1095 {
1096 	struct sfxge_lro_state *st = &rxq->lro;
1097 	struct sfxge_lro_conn *c;
1098 	unsigned i;
1099 
1100 	/* Return cleanly if sfxge_lro_init() has not been called. */
1101 	if (st->conns == NULL)
1102 		return;
1103 
1104 	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1105 
1106 	for (i = 0; i <= st->conns_mask; ++i) {
1107 		while (!TAILQ_EMPTY(&st->conns[i])) {
1108 			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1109 			sfxge_lro_drop(rxq, c);
1110 		}
1111 	}
1112 
1113 	while (!TAILQ_EMPTY(&st->free_conns)) {
1114 		c = TAILQ_FIRST(&st->free_conns);
1115 		TAILQ_REMOVE(&st->free_conns, c, link);
1116 		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1117 		free(c, M_SFXGE);
1118 	}
1119 
1120 	free(st->conns_n, M_SFXGE);
1121 	free(st->conns, M_SFXGE);
1122 	st->conns = NULL;
1123 }
1124 
1125 #else
1126 
1127 static void
1128 sfxge_lro_init(struct sfxge_rxq *rxq)
1129 {
1130 }
1131 
1132 static void
1133 sfxge_lro_fini(struct sfxge_rxq *rxq)
1134 {
1135 }
1136 
1137 #endif	/* SFXGE_LRO */
1138 
1139 static void
1140 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1141 {
1142 	struct sfxge_rxq *rxq;
1143 
1144 	rxq = sc->rxq[index];
1145 
1146 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1147 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1148 
1149 	/* Free the context array and the flow table. */
1150 	free(rxq->queue, M_SFXGE);
1151 	sfxge_lro_fini(rxq);
1152 
1153 	/* Release DMA memory. */
1154 	sfxge_dma_free(&rxq->mem);
1155 
1156 	sc->rxq[index] = NULL;
1157 
1158 	free(rxq, M_SFXGE);
1159 }
1160 
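/*
 * Allocate and initialize the software state for a receive queue:
 * DMA memory for the descriptor ring, buffer-table entries, the
 * software descriptor array and the LRO state.
 */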
1161 static int
1162 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1163 {
1164 	struct sfxge_rxq *rxq;
1165 	struct sfxge_evq *evq;
1166 	efsys_mem_t *esmp;
1167 	int rc;
1168 
1169 	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1170 
1171 	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1172 	rxq->sc = sc;
1173 	rxq->index = index;
1174 	rxq->entries = sc->rxq_entries;
1175 	rxq->ptr_mask = rxq->entries - 1;
1176 	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1177 
1178 	sc->rxq[index] = rxq;
1179 	esmp = &rxq->mem;
1180 
1181 	evq = sc->evq[index];
1182 
1183 	/* Allocate and zero DMA space. */
1184 	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1185 		return (rc);
1186 
1187 	/* Allocate buffer table entries. */
1188 	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1189 				 &rxq->buf_base_id);
1190 
1191 	/* Allocate the context array and the flow table. */
1192 	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1193 	    M_SFXGE, M_WAITOK | M_ZERO);
1194 	sfxge_lro_init(rxq);
1195 
1196 	callout_init(&rxq->refill_callout, B_TRUE);
1197 
1198 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1199 
1200 	return (0);
1201 }
1202 
1203 static const struct {
1204 	const char *name;
1205 	size_t offset;
1206 } sfxge_rx_stats[] = {
1207 #define	SFXGE_RX_STAT(name, member) \
1208 	{ #name, offsetof(struct sfxge_rxq, member) }
1209 #ifdef SFXGE_LRO
1210 	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1211 	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1212 	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1213 	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1214 	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1215 	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1216 	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1217 	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1218 #endif
1219 };
1220 
1221 static int
1222 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1223 {
1224 	struct sfxge_softc *sc = arg1;
1225 	unsigned int id = arg2;
1226 	unsigned int sum, index;
1227 
1228 	/* Sum across all RX queues */
1229 	sum = 0;
1230 	for (index = 0; index < sc->rxq_count; index++)
1231 		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1232 					 sfxge_rx_stats[id].offset);
1233 
1234 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1235 }
1236 
1237 static void
1238 sfxge_rx_stat_init(struct sfxge_softc *sc)
1239 {
1240 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1241 	struct sysctl_oid_list *stat_list;
1242 	unsigned int id;
1243 
1244 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1245 
1246 	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1247 		SYSCTL_ADD_PROC(
1248 			ctx, stat_list,
1249 			OID_AUTO, sfxge_rx_stats[id].name,
1250 			CTLTYPE_UINT|CTLFLAG_RD,
1251 			sc, id, sfxge_rx_stat_handler, "IU",
1252 			"");
1253 	}
1254 }
1255 
1256 void
1257 sfxge_rx_fini(struct sfxge_softc *sc)
1258 {
1259 	int index;
1260 
1261 	index = sc->rxq_count;
1262 	while (--index >= 0)
1263 		sfxge_rx_qfini(sc, index);
1264 
1265 	sc->rxq_count = 0;
1266 }
1267 
1268 int
1269 sfxge_rx_init(struct sfxge_softc *sc)
1270 {
1271 	struct sfxge_intr *intr;
1272 	int index;
1273 	int rc;
1274 
1275 #ifdef SFXGE_LRO
1276 	if (!ISP2(lro_table_size)) {
1277 		log(LOG_ERR, "%s=%u must be a power of 2\n",
1278 		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1279 		rc = EINVAL;
1280 		goto fail_lro_table_size;
1281 	}
1282 
1283 	if (lro_idle_ticks == 0)
1284 		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1285 #endif
1286 
1287 	intr = &sc->intr;
1288 
1289 	sc->rxq_count = intr->n_alloc;
1290 
1291 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1292 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1293 
1294 	/* Initialize the receive queue(s) - one per interrupt. */
1295 	for (index = 0; index < sc->rxq_count; index++) {
1296 		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1297 			goto fail;
1298 	}
1299 
1300 	sfxge_rx_stat_init(sc);
1301 
1302 	return (0);
1303 
1304 fail:
1305 	/* Tear down the receive queue(s). */
1306 	while (--index >= 0)
1307 		sfxge_rx_qfini(sc, index);
1308 
1309 	sc->rxq_count = 0;
1310 
1311 #ifdef SFXGE_LRO
1312 fail_lro_table_size:
1313 #endif
1314 	return (rc);
1315 }
1316