xref: /freebsd/sys/dev/sfxge/sfxge_rx.c (revision ec0e626bafb335b30c499d06066997f54b10c092)
1 /*-
2  * Copyright (c) 2010-2011 Solarflare Communications, Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/types.h>
34 #include <sys/mbuf.h>
35 #include <sys/smp.h>
36 #include <sys/socket.h>
37 #include <sys/sysctl.h>
38 #include <sys/limits.h>
39 #include <sys/syslog.h>
40 
41 #include <net/ethernet.h>
42 #include <net/if.h>
43 #include <net/if_vlan_var.h>
44 
45 #include <netinet/in.h>
46 #include <netinet/ip.h>
47 #include <netinet/ip6.h>
48 #include <netinet/tcp.h>
49 
50 #include <machine/in_cksum.h>
51 
52 #include "common/efx.h"
53 
54 
55 #include "sfxge.h"
56 #include "sfxge_rx.h"
57 
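/* A refill is triggered from sfxge_rx_qcomplete() when the number of
 * outstanding descriptors falls below this threshold (90% of the queue
 * limit).
 */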
58 #define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
59 
60 #ifdef SFXGE_LRO
61 
62 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
63 	    "Large receive offload (LRO) parameters");
64 
65 #define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
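/* The LRO parameters below are exposed as loader tunables and read-only
 * sysctls under hw.sfxge.lro (assuming SFXGE_PARAM() prepends "hw.sfxge."
 * as defined in sfxge.h), e.g. hw.sfxge.lro.table_size=256 in
 * /boot/loader.conf.
 */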
66 
67 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
68  * means we can accelerate a larger number of streams.
69  */
70 static unsigned lro_table_size = 128;
71 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
72 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
73 	    &lro_table_size, 0,
74 	    "Size of the LRO hash table (must be a power of 2)");
75 
76 /* Maximum length of a hash chain.  If chains get too long then the lookup
77  * time increases and may exceed the benefit of LRO.
78  */
79 static unsigned lro_chain_max = 20;
80 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
81 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
82 	    &lro_chain_max, 0,
83 	    "The maximum length of a hash chain");
84 
85 /* Maximum time (in ticks) that a connection can be idle before its LRO
86  * state is discarded.
87  */
88 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
89 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
90 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
91 	    &lro_idle_ticks, 0,
92 	    "The maximum time (in ticks) that a connection can be idle "
93 	    "before its LRO state is discarded");
94 
95 /* Number of packets with payload that must arrive in-order before a
96  * connection is eligible for LRO.  The idea is we should avoid coalescing
97  * segments when the sender is in slow-start because reducing the ACK rate
98  * can damage performance.
99  */
100 static int lro_slow_start_packets = 2000;
101 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
102 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
103 	    &lro_slow_start_packets, 0,
104 	    "Number of packets with payload that must arrive in-order before "
105 	    "a connection is eligible for LRO");
106 
107 /* Number of packets with payload that must arrive in-order following loss
108  * before a connection is eligible for LRO.  The idea is we should avoid
109  * coalescing segments when the sender is recovering from loss, because
110  * reducing the ACK rate can damage performance.
111  */
112 static int lro_loss_packets = 20;
113 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
114 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
115 	    &lro_loss_packets, 0,
116 	    "Number of packets with payload that must arrive in-order "
117 	    "following loss before a connection is eligible for LRO");
118 
119 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
120 #define	SFXGE_LRO_L2_ID_VLAN 0x4000
121 #define	SFXGE_LRO_L2_ID_IPV6 0x8000
122 #define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
123 #define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
124 
125 /* Compare IPv6 addresses, avoiding conditional branches */
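/* The result is zero if and only if the two addresses are equal. */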
126 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
127 				   const struct in6_addr *right)
128 {
129 #if LONG_BIT == 64
130 	const uint64_t *left64 = (const uint64_t *)left;
131 	const uint64_t *right64 = (const uint64_t *)right;
132 	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
133 #else
134 	return (left->s6_addr32[0] - right->s6_addr32[0]) |
135 	       (left->s6_addr32[1] - right->s6_addr32[1]) |
136 	       (left->s6_addr32[2] - right->s6_addr32[2]) |
137 	       (left->s6_addr32[3] - right->s6_addr32[3]);
138 #endif
139 }
140 
141 #endif	/* SFXGE_LRO */
142 
143 void
144 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
145 {
146 
147 	rxq->flush_state = SFXGE_FLUSH_DONE;
148 }
149 
150 void
151 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
152 {
153 
154 	rxq->flush_state = SFXGE_FLUSH_FAILED;
155 }
156 
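/* RSS Toeplitz hash key; programmed into the controller by
 * efx_rx_scale_toeplitz_ipv4_key_set() in sfxge_rx_start() below.
 */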
157 static uint8_t toep_key[] = {
158 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
159 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
160 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
161 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
162 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
163 };
164 
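/* Callout handler scheduled by sfxge_rx_schedule_refill(): post a "magic"
 * event so that the queue refill is retried from the event queue context.
 */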
165 static void
166 sfxge_rx_post_refill(void *arg)
167 {
168 	struct sfxge_rxq *rxq = arg;
169 	struct sfxge_softc *sc;
170 	unsigned int index;
171 	struct sfxge_evq *evq;
172 	uint16_t magic;
173 
174 	sc = rxq->sc;
175 	index = rxq->index;
176 	evq = sc->evq[index];
177 
178 	magic = SFXGE_MAGIC_RX_QREFILL | index;
179 
180 	/* This is guaranteed due to the start/stop order of rx and ev */
181 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
182 	    ("evq not started"));
183 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
184 	    ("rxq not started"));
185 	efx_ev_qpost(evq->common, magic);
186 }
187 
188 static void
189 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
190 {
191 	/* Initially retry after 100 ms, but back off in case of
192 	 * repeated failures as we probably have to wait for the
193 	 * administrator to raise the pool limit. */
194 	if (retrying)
195 		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
196 	else
197 		rxq->refill_delay = hz / 10;
198 
199 	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
200 			     sfxge_rx_post_refill, rxq);
201 }
202 
203 static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
204 {
205 	struct mb_args args;
206 	struct mbuf *m;
207 
208 	/* Allocate mbuf structure */
209 	args.flags = M_PKTHDR;
210 	args.type = MT_DATA;
211 	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);
212 
213 	/* Allocate (and attach) packet buffer */
214 	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
215 		uma_zfree(zone_mbuf, m);
216 		m = NULL;
217 	}
218 
219 	return (m);
220 }
221 
222 #define	SFXGE_REFILL_BATCH  64
223 
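/* Fill the receive queue with up to "target" new buffers, posting their DMA
 * addresses in batches of SFXGE_REFILL_BATCH and pushing the descriptors to
 * the hardware once at the end.
 */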
224 static void
225 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
226 {
227 	struct sfxge_softc *sc;
228 	unsigned int index;
229 	struct sfxge_evq *evq;
230 	unsigned int batch;
231 	unsigned int rxfill;
232 	unsigned int mblksize;
233 	int ntodo;
234 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
235 
236 	sc = rxq->sc;
237 	index = rxq->index;
238 	evq = sc->evq[index];
239 
240 	prefetch_read_many(sc->enp);
241 	prefetch_read_many(rxq->common);
242 
243 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
244 
245 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
246 		return;
247 
248 	rxfill = rxq->added - rxq->completed;
249 	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
250 	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
251 	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
252 	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
253 	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
254 
255 	if (ntodo == 0)
256 		return;
257 
258 	batch = 0;
259 	mblksize = sc->rx_buffer_size;
260 	while (ntodo-- > 0) {
261 		unsigned int id;
262 		struct sfxge_rx_sw_desc *rx_desc;
263 		bus_dma_segment_t seg;
264 		struct mbuf *m;
265 
266 		id = (rxq->added + batch) & rxq->ptr_mask;
267 		rx_desc = &rxq->queue[id];
268 		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
269 
270 		rx_desc->flags = EFX_DISCARD;
271 		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
272 		if (m == NULL)
273 			break;
274 		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
275 		addr[batch++] = seg.ds_addr;
276 
277 		if (batch == SFXGE_REFILL_BATCH) {
278 			efx_rx_qpost(rxq->common, addr, mblksize, batch,
279 			    rxq->completed, rxq->added);
280 			rxq->added += batch;
281 			batch = 0;
282 		}
283 	}
284 
285 	if (ntodo != 0)
286 		sfxge_rx_schedule_refill(rxq, retrying);
287 
288 	if (batch != 0) {
289 		efx_rx_qpost(rxq->common, addr, mblksize, batch,
290 		    rxq->completed, rxq->added);
291 		rxq->added += batch;
292 	}
293 
294 	/* Make the descriptors visible to the hardware */
295 	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
296 			BUS_DMASYNC_PREWRITE);
297 
298 	efx_rx_qpush(rxq->common, rxq->added);
299 }
300 
301 void
302 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
303 {
304 
305 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
306 		return;
307 
308 	/* Make sure the queue is full */
309 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
310 }
311 
312 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
313 {
314 	struct ifnet *ifp = sc->ifnet;
315 
316 	m->m_pkthdr.rcvif = ifp;
317 	m->m_pkthdr.csum_data = 0xffff;
318 	ifp->if_input(ifp, m);
319 }
320 
321 static void
322 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
323 {
324 	struct mbuf *m = rx_desc->mbuf;
325 	int csum_flags;
326 
327 	/* Convert checksum flags */
328 	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
329 		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
330 	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
331 		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
332 
333 #ifdef SFXGE_HAVE_MQ
334 	/* The hash covers a 4-tuple for TCP only */
335 	if (rx_desc->flags & EFX_PKT_TCP) {
336 		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
337 						       mtod(m, uint8_t *));
338 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
339 	}
340 #endif
341 	m->m_data += sc->rx_prefix_size;
342 	m->m_len = rx_desc->size - sc->rx_prefix_size;
343 	m->m_pkthdr.len = m->m_len;
344 	m->m_pkthdr.csum_flags = csum_flags;
345 	__sfxge_rx_deliver(sc, rx_desc->mbuf);
346 
347 	rx_desc->flags = EFX_DISCARD;
348 	rx_desc->mbuf = NULL;
349 }
350 
351 #ifdef SFXGE_LRO
352 
353 static void
354 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
355 {
356 	struct sfxge_softc *sc = st->sc;
357 	struct mbuf *m = c->mbuf;
358 	struct tcphdr *c_th;
359 	int csum_flags;
360 
361 	KASSERT(m, ("no mbuf to deliver"));
362 
363 	++st->n_bursts;
364 
365 	/* Finish off packet munging and recalculate IP header checksum. */
366 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
367 		struct ip *iph = c->nh;
368 		iph->ip_len = htons(iph->ip_len);
369 		iph->ip_sum = 0;
370 		iph->ip_sum = in_cksum_hdr(iph);
371 		c_th = (struct tcphdr *)(iph + 1);
372 		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
373 			      CSUM_IP_CHECKED | CSUM_IP_VALID);
374 	} else {
375 		struct ip6_hdr *iph = c->nh;
376 		iph->ip6_plen = htons(iph->ip6_plen);
377 		c_th = (struct tcphdr *)(iph + 1);
378 		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
379 	}
380 
381 	c_th->th_win = c->th_last->th_win;
382 	c_th->th_ack = c->th_last->th_ack;
383 	if (c_th->th_off == c->th_last->th_off) {
384 		/* Copy TCP options (take care to avoid going negative). */
385 		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
386 		memcpy(c_th + 1, c->th_last + 1, optlen);
387 	}
388 
389 #ifdef SFXGE_HAVE_MQ
390 	m->m_pkthdr.flowid = c->conn_hash;
391 	M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
392 #endif
393 	m->m_pkthdr.csum_flags = csum_flags;
394 	__sfxge_rx_deliver(sc, m);
395 
396 	c->mbuf = NULL;
397 	c->delivered = 1;
398 }
399 
400 /* Drop the given connection, and add it to the free list. */
401 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
402 {
403 	unsigned bucket;
404 
405 	KASSERT(!c->mbuf, ("found orphaned mbuf"));
406 
407 	if (c->next_buf.mbuf != NULL) {
408 		sfxge_rx_deliver(rxq->sc, &c->next_buf);
409 		LIST_REMOVE(c, active_link);
410 	}
411 
412 	bucket = c->conn_hash & rxq->lro.conns_mask;
413 	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
414 	--rxq->lro.conns_n[bucket];
415 	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
416 	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
417 }
418 
419 /* Stop tracking connections that have gone idle in order to keep hash
420  * chains short.
421  */
422 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
423 {
424 	struct sfxge_lro_conn *c;
425 	unsigned i;
426 
427 	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
428 		("found active connections"));
429 
430 	rxq->lro.last_purge_ticks = now;
431 	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
432 		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
433 			continue;
434 
435 		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
436 		if (now - c->last_pkt_ticks > lro_idle_ticks) {
437 			++rxq->lro.n_drop_idle;
438 			sfxge_lro_drop(rxq, c);
439 		}
440 	}
441 }
442 
443 static void
444 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
445 		struct mbuf *mbuf, struct tcphdr *th)
446 {
447 	struct tcphdr *c_th;
448 
449 	/* Tack the new mbuf onto the chain. */
450 	KASSERT(!mbuf->m_next, ("mbuf already chained"));
451 	c->mbuf_tail->m_next = mbuf;
452 	c->mbuf_tail = mbuf;
453 
454 	/* Increase length appropriately */
455 	c->mbuf->m_pkthdr.len += mbuf->m_len;
456 
457 	/* Update the connection state flags */
458 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
459 		struct ip *iph = c->nh;
460 		iph->ip_len += mbuf->m_len;
461 		c_th = (struct tcphdr *)(iph + 1);
462 	} else {
463 		struct ip6_hdr *iph = c->nh;
464 		iph->ip6_plen += mbuf->m_len;
465 		c_th = (struct tcphdr *)(iph + 1);
466 	}
467 	c_th->th_flags |= (th->th_flags & TH_PUSH);
468 	c->th_last = th;
469 	++st->n_merges;
470 
471 	/* Pass packet up now if another segment could overflow the IP
472 	 * length.
473 	 */
474 	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
475 		sfxge_lro_deliver(st, c);
476 }
477 
478 static void
479 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
480 		struct mbuf *mbuf, void *nh, struct tcphdr *th)
481 {
482 	/* Start the chain */
483 	c->mbuf = mbuf;
484 	c->mbuf_tail = c->mbuf;
485 	c->nh = nh;
486 	c->th_last = th;
487 
488 	mbuf->m_pkthdr.len = mbuf->m_len;
489 
490 	/* Mangle header fields for later processing */
491 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
492 		struct ip *iph = nh;
493 		iph->ip_len = ntohs(iph->ip_len);
494 	} else {
495 		struct ip6_hdr *iph = nh;
496 		iph->ip6_plen = ntohs(iph->ip6_plen);
497 	}
498 }
499 
500 /* Try to merge or otherwise hold or deliver (as appropriate) the
501  * packet buffered for this connection (c->next_buf).  Return a flag
502  * indicating whether the connection is still active for LRO purposes.
503  */
504 static int
505 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
506 {
507 	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
508 	char *eh = c->next_eh;
509 	int data_length, hdr_length, dont_merge;
510 	unsigned th_seq, pkt_length;
511 	struct tcphdr *th;
512 	unsigned now;
513 
514 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
515 		struct ip *iph = c->next_nh;
516 		th = (struct tcphdr *)(iph + 1);
517 		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
518 	} else {
519 		struct ip6_hdr *iph = c->next_nh;
520 		th = (struct tcphdr *)(iph + 1);
521 		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
522 	}
523 
524 	hdr_length = (char *) th + th->th_off * 4 - eh;
525 	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
526 		       hdr_length);
527 	th_seq = ntohl(th->th_seq);
528 	dont_merge = ((data_length <= 0)
529 		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
530 
531 	/* Check for options other than aligned timestamp. */
532 	if (th->th_off != 5) {
533 		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
534 		if (th->th_off == 8 &&
535 		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
536 					(TCPOPT_NOP << 16) |
537 					(TCPOPT_TIMESTAMP << 8) |
538 					TCPOLEN_TIMESTAMP)) {
539 			/* timestamp option -- okay */
540 		} else {
541 			dont_merge = 1;
542 		}
543 	}
544 
545 	if (__predict_false(th_seq != c->next_seq)) {
546 		/* Out-of-order, so start counting again. */
547 		if (c->mbuf != NULL)
548 			sfxge_lro_deliver(&rxq->lro, c);
549 		c->n_in_order_pkts -= lro_loss_packets;
550 		c->next_seq = th_seq + data_length;
551 		++rxq->lro.n_misorder;
552 		goto deliver_buf_out;
553 	}
554 	c->next_seq = th_seq + data_length;
555 
556 	now = ticks;
557 	if (now - c->last_pkt_ticks > lro_idle_ticks) {
558 		++rxq->lro.n_drop_idle;
559 		if (c->mbuf != NULL)
560 			sfxge_lro_deliver(&rxq->lro, c);
561 		sfxge_lro_drop(rxq, c);
562 		return (0);
563 	}
564 	c->last_pkt_ticks = ticks;
565 
566 	if (c->n_in_order_pkts < lro_slow_start_packets) {
567 		/* May be in slow-start, so don't merge. */
568 		++rxq->lro.n_slow_start;
569 		++c->n_in_order_pkts;
570 		goto deliver_buf_out;
571 	}
572 
573 	if (__predict_false(dont_merge)) {
574 		if (c->mbuf != NULL)
575 			sfxge_lro_deliver(&rxq->lro, c);
576 		if (th->th_flags & (TH_FIN | TH_RST)) {
577 			++rxq->lro.n_drop_closed;
578 			sfxge_lro_drop(rxq, c);
579 			return (0);
580 		}
581 		goto deliver_buf_out;
582 	}
583 
584 	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
585 
586 	if (__predict_true(c->mbuf != NULL)) {
587 		/* Remove headers and any padding */
588 		rx_buf->mbuf->m_data += hdr_length;
589 		rx_buf->mbuf->m_len = data_length;
590 
591 		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
592 	} else {
593 		/* Remove any padding */
594 		rx_buf->mbuf->m_len = pkt_length;
595 
596 		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
597 	}
598 
599 	rx_buf->mbuf = NULL;
600 	return (1);
601 
602  deliver_buf_out:
603 	sfxge_rx_deliver(rxq->sc, rx_buf);
604 	return (1);
605 }
606 
607 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
608 			       uint16_t l2_id, void *nh, struct tcphdr *th)
609 {
610 	unsigned bucket = conn_hash & st->conns_mask;
611 	struct sfxge_lro_conn *c;
612 
613 	if (st->conns_n[bucket] >= lro_chain_max) {
614 		++st->n_too_many;
615 		return;
616 	}
617 
618 	if (!TAILQ_EMPTY(&st->free_conns)) {
619 		c = TAILQ_FIRST(&st->free_conns);
620 		TAILQ_REMOVE(&st->free_conns, c, link);
621 	} else {
622 		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
623 		if (c == NULL)
624 			return;
625 		c->mbuf = NULL;
626 		c->next_buf.mbuf = NULL;
627 	}
628 
629 	/* Create the connection tracking data */
630 	++st->conns_n[bucket];
631 	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
632 	c->l2_id = l2_id;
633 	c->conn_hash = conn_hash;
634 	c->source = th->th_sport;
635 	c->dest = th->th_dport;
636 	c->n_in_order_pkts = 0;
637 	c->last_pkt_ticks = *(volatile int *)&ticks;
638 	c->delivered = 0;
639 	++st->n_new_stream;
640 	/* NB. We don't initialise c->next_seq, and it doesn't matter what
641 	 * value it has.  Most likely the next packet received for this
642 	 * connection will not match -- no harm done.
643 	 */
644 }
645 
646 /* Process mbuf and decide whether to dispatch it to the stack now or
647  * later.
648  */
649 static void
650 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
651 {
652 	struct sfxge_softc *sc = rxq->sc;
653 	struct mbuf *m = rx_buf->mbuf;
654 	struct ether_header *eh;
655 	struct sfxge_lro_conn *c;
656 	uint16_t l2_id;
657 	uint16_t l3_proto;
658 	void *nh;
659 	struct tcphdr *th;
660 	uint32_t conn_hash;
661 	unsigned bucket;
662 
663 	/* Get the hardware hash */
664 	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
665 				      mtod(m, uint8_t *));
666 
667 	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
668 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
669 		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
670 		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
671 			SFXGE_LRO_L2_ID_VLAN;
672 		l3_proto = veh->evl_proto;
673 		nh = veh + 1;
674 	} else {
675 		l2_id = 0;
676 		l3_proto = eh->ether_type;
677 		nh = eh + 1;
678 	}
679 
680 	/* Check whether this is a suitable packet (unfragmented
681 	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
682 	 * length, and compute a hash if necessary.  If not, return.
683 	 */
684 	if (l3_proto == htons(ETHERTYPE_IP)) {
685 		struct ip *iph = nh;
686 		if ((iph->ip_p - IPPROTO_TCP) |
687 		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
688 		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
689 			goto deliver_now;
690 		th = (struct tcphdr *)(iph + 1);
691 	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
692 		struct ip6_hdr *iph = nh;
693 		if (iph->ip6_nxt != IPPROTO_TCP)
694 			goto deliver_now;
695 		l2_id |= SFXGE_LRO_L2_ID_IPV6;
696 		th = (struct tcphdr *)(iph + 1);
697 	} else {
698 		goto deliver_now;
699 	}
700 
701 	bucket = conn_hash & rxq->lro.conns_mask;
702 
703 	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
704 		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
705 			continue;
706 		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
707 			continue;
708 		if (c->mbuf != NULL) {
709 			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
710 				struct ip *c_iph, *iph = nh;
711 				c_iph = c->nh;
712 				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
713 				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
714 					continue;
715 			} else {
716 				struct ip6_hdr *c_iph, *iph = nh;
717 				c_iph = c->nh;
718 				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
719 				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
720 					continue;
721 			}
722 		}
723 
724 		/* Re-insert at head of list to reduce lookup time. */
725 		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
726 		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
727 
728 		if (c->next_buf.mbuf != NULL) {
729 			if (!sfxge_lro_try_merge(rxq, c))
730 				goto deliver_now;
731 		} else {
732 			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
733 			    active_link);
734 		}
735 		c->next_buf = *rx_buf;
736 		c->next_eh = eh;
737 		c->next_nh = nh;
738 
739 		rx_buf->mbuf = NULL;
740 		rx_buf->flags = EFX_DISCARD;
741 		return;
742 	}
743 
744 	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
745  deliver_now:
746 	sfxge_rx_deliver(sc, rx_buf);
747 }
748 
749 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
750 {
751 	struct sfxge_lro_state *st = &rxq->lro;
752 	struct sfxge_lro_conn *c;
753 	unsigned t;
754 
755 	while (!LIST_EMPTY(&st->active_conns)) {
756 		c = LIST_FIRST(&st->active_conns);
757 		if (!c->delivered && c->mbuf != NULL)
758 			sfxge_lro_deliver(st, c);
759 		if (sfxge_lro_try_merge(rxq, c)) {
760 			if (c->mbuf != NULL)
761 				sfxge_lro_deliver(st, c);
762 			LIST_REMOVE(c, active_link);
763 		}
764 		c->delivered = 0;
765 	}
766 
767 	t = *(volatile int *)&ticks;
768 	if (__predict_false(t != st->last_purge_ticks))
769 		sfxge_lro_purge_idle(rxq, t);
770 }
771 
772 #else	/* !SFXGE_LRO */
773 
774 static void
775 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
776 {
777 }
778 
779 static void
780 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
781 {
782 }
783 
784 #endif	/* SFXGE_LRO */
785 
786 void
787 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
788 {
789 	struct sfxge_softc *sc = rxq->sc;
790 	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
791 	unsigned int index;
792 	struct sfxge_evq *evq;
793 	unsigned int completed;
794 	unsigned int level;
795 	struct mbuf *m;
796 	struct sfxge_rx_sw_desc *prev = NULL;
797 
798 	index = rxq->index;
799 	evq = sc->evq[index];
800 
801 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
802 
803 	completed = rxq->completed;
804 	while (completed != rxq->pending) {
805 		unsigned int id;
806 		struct sfxge_rx_sw_desc *rx_desc;
807 
808 		id = completed++ & rxq->ptr_mask;
809 		rx_desc = &rxq->queue[id];
810 		m = rx_desc->mbuf;
811 
812 		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
813 			goto discard;
814 
815 		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
816 			goto discard;
817 
818 		prefetch_read_many(mtod(m, caddr_t));
819 
820 		/* Check for loopback packets */
821 		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
822 		    !(rx_desc->flags & EFX_PKT_IPV6)) {
823 			struct ether_header *etherhp;
824 
825 			/*LINTED*/
826 			etherhp = mtod(m, struct ether_header *);
827 
828 			if (etherhp->ether_type ==
829 			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
830 				EFSYS_PROBE(loopback);
831 
832 				rxq->loopback++;
833 				goto discard;
834 			}
835 		}
836 
837 		/* Pass packet up the stack or into LRO (pipelined) */
838 		if (prev != NULL) {
839 			if (lro_enabled)
840 				sfxge_lro(rxq, prev);
841 			else
842 				sfxge_rx_deliver(sc, prev);
843 		}
844 		prev = rx_desc;
845 		continue;
846 
847 discard:
848 		/* Return the packet to the pool */
849 		m_free(m);
850 		rx_desc->mbuf = NULL;
851 	}
852 	rxq->completed = completed;
853 
854 	level = rxq->added - rxq->completed;
855 
856 	/* Pass last packet up the stack or into LRO */
857 	if (prev != NULL) {
858 		if (lro_enabled)
859 			sfxge_lro(rxq, prev);
860 		else
861 			sfxge_rx_deliver(sc, prev);
862 	}
863 
864 	/*
865 	 * If there are any pending flows and this is the end of the
866 	 * poll then they must be completed.
867 	 */
868 	if (eop)
869 		sfxge_lro_end_of_burst(rxq);
870 
871 	/* Top up the queue if necessary */
872 	if (level < rxq->refill_threshold)
873 		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
874 }
875 
876 static void
877 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
878 {
879 	struct sfxge_rxq *rxq;
880 	struct sfxge_evq *evq;
881 	unsigned int count;
882 
883 	rxq = sc->rxq[index];
884 	evq = sc->evq[index];
885 
886 	SFXGE_EVQ_LOCK(evq);
887 
888 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
889 	    ("rxq not started"));
890 
891 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
892 
893 	callout_stop(&rxq->refill_callout);
894 
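	/* Flush the hardware queue and poll (up to 20 x 100 ms) for the
	 * flush-done event; restart from "again" if the flush failed.
	 */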
895 again:
896 	rxq->flush_state = SFXGE_FLUSH_PENDING;
897 
898 	/* Flush the receive queue */
899 	efx_rx_qflush(rxq->common);
900 
901 	SFXGE_EVQ_UNLOCK(evq);
902 
903 	count = 0;
904 	do {
905 		/* Spin for 100 ms */
906 		DELAY(100000);
907 
908 		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
909 			break;
910 
911 	} while (++count < 20);
912 
913 	SFXGE_EVQ_LOCK(evq);
914 
915 	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
916 		goto again;
917 
918 	rxq->flush_state = SFXGE_FLUSH_DONE;
919 
920 	rxq->pending = rxq->added;
921 	sfxge_rx_qcomplete(rxq, B_TRUE);
922 
923 	KASSERT(rxq->completed == rxq->pending,
924 	    ("rxq->completed != rxq->pending"));
925 
926 	rxq->added = 0;
927 	rxq->pending = 0;
928 	rxq->completed = 0;
929 	rxq->loopback = 0;
930 
931 	/* Destroy the common code receive queue. */
932 	efx_rx_qdestroy(rxq->common);
933 
934 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
935 	    EFX_RXQ_NBUFS(sc->rxq_entries));
936 
937 	SFXGE_EVQ_UNLOCK(evq);
938 }
939 
940 static int
941 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
942 {
943 	struct sfxge_rxq *rxq;
944 	efsys_mem_t *esmp;
945 	struct sfxge_evq *evq;
946 	int rc;
947 
948 	rxq = sc->rxq[index];
949 	esmp = &rxq->mem;
950 	evq = sc->evq[index];
951 
952 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
953 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
954 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
955 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
956 
957 	/* Program the buffer table. */
958 	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
959 	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
960 		return (rc);
961 
962 	/* Create the common code receive queue. */
963 	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
964 	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
965 	    &rxq->common)) != 0)
966 		goto fail;
967 
968 	SFXGE_EVQ_LOCK(evq);
969 
970 	/* Enable the receive queue. */
971 	efx_rx_qenable(rxq->common);
972 
973 	rxq->init_state = SFXGE_RXQ_STARTED;
974 
975 	/* Try to fill the queue from the pool. */
976 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
977 
978 	SFXGE_EVQ_UNLOCK(evq);
979 
980 	return (0);
981 
982 fail:
983 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
984 	    EFX_RXQ_NBUFS(sc->rxq_entries));
985 	return (rc);
986 }
987 
988 void
989 sfxge_rx_stop(struct sfxge_softc *sc)
990 {
991 	int index;
992 
993 	/* Stop the receive queue(s) */
994 	index = sc->rxq_count;
995 	while (--index >= 0)
996 		sfxge_rx_qstop(sc, index);
997 
998 	sc->rx_prefix_size = 0;
999 	sc->rx_buffer_size = 0;
1000 
1001 	efx_rx_fini(sc->enp);
1002 }
1003 
1004 int
1005 sfxge_rx_start(struct sfxge_softc *sc)
1006 {
1007 	struct sfxge_intr *intr;
1008 	int index;
1009 	int rc;
1010 
1011 	intr = &sc->intr;
1012 
1013 	/* Initialize the common code receive module. */
1014 	if ((rc = efx_rx_init(sc->enp)) != 0)
1015 		return (rc);
1016 
1017 	/* Calculate the receive packet buffer size. */
1018 	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
1019 	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
1020 			      sc->rx_prefix_size);
1021 
1022 	/* Select zone for packet buffers */
1023 	if (sc->rx_buffer_size <= MCLBYTES)
1024 		sc->rx_buffer_zone = zone_clust;
1025 	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
1026 		sc->rx_buffer_zone = zone_jumbop;
1027 	else if (sc->rx_buffer_size <= MJUM9BYTES)
1028 		sc->rx_buffer_zone = zone_jumbo9;
1029 	else
1030 		sc->rx_buffer_zone = zone_jumbo16;
1031 
1032 	/*
1033 	 * Set up the scale table.  Enable all hash types and hash insertion.
1034 	 */
1035 	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1036 		sc->rx_indir_table[index] = index % sc->rxq_count;
1037 	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1038 				       SFXGE_RX_SCALE_MAX)) != 0)
1039 		goto fail;
1040 	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1041 	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1042 	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1043 
1044 	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
1045 	    sizeof(toep_key))) != 0)
1046 		goto fail;
1047 
1048 	/* Start the receive queue(s). */
1049 	for (index = 0; index < sc->rxq_count; index++) {
1050 		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1051 			goto fail2;
1052 	}
1053 
1054 	return (0);
1055 
1056 fail2:
1057 	while (--index >= 0)
1058 		sfxge_rx_qstop(sc, index);
1059 
1060 fail:
1061 	efx_rx_fini(sc->enp);
1062 
1063 	return (rc);
1064 }
1065 
1066 #ifdef SFXGE_LRO
1067 
1068 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1069 {
1070 	struct sfxge_lro_state *st = &rxq->lro;
1071 	unsigned i;
1072 
1073 	st->conns_mask = lro_table_size - 1;
1074 	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1075 		("lro_table_size must be a power of 2"));
1076 	st->sc = rxq->sc;
1077 	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1078 			   M_SFXGE, M_WAITOK);
1079 	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1080 			     M_SFXGE, M_WAITOK);
1081 	for (i = 0; i <= st->conns_mask; ++i) {
1082 		TAILQ_INIT(&st->conns[i]);
1083 		st->conns_n[i] = 0;
1084 	}
1085 	LIST_INIT(&st->active_conns);
1086 	TAILQ_INIT(&st->free_conns);
1087 }
1088 
1089 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1090 {
1091 	struct sfxge_lro_state *st = &rxq->lro;
1092 	struct sfxge_lro_conn *c;
1093 	unsigned i;
1094 
1095 	/* Return cleanly if sfxge_lro_init() has not been called. */
1096 	if (st->conns == NULL)
1097 		return;
1098 
1099 	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1100 
1101 	for (i = 0; i <= st->conns_mask; ++i) {
1102 		while (!TAILQ_EMPTY(&st->conns[i])) {
1103 			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1104 			sfxge_lro_drop(rxq, c);
1105 		}
1106 	}
1107 
1108 	while (!TAILQ_EMPTY(&st->free_conns)) {
1109 		c = TAILQ_FIRST(&st->free_conns);
1110 		TAILQ_REMOVE(&st->free_conns, c, link);
1111 		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1112 		free(c, M_SFXGE);
1113 	}
1114 
1115 	free(st->conns_n, M_SFXGE);
1116 	free(st->conns, M_SFXGE);
1117 	st->conns = NULL;
1118 }
1119 
1120 #else
1121 
1122 static void
1123 sfxge_lro_init(struct sfxge_rxq *rxq)
1124 {
1125 }
1126 
1127 static void
1128 sfxge_lro_fini(struct sfxge_rxq *rxq)
1129 {
1130 }
1131 
1132 #endif	/* SFXGE_LRO */
1133 
1134 static void
1135 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1136 {
1137 	struct sfxge_rxq *rxq;
1138 
1139 	rxq = sc->rxq[index];
1140 
1141 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1142 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1143 
1144 	/* Free the context array and the flow table. */
1145 	free(rxq->queue, M_SFXGE);
1146 	sfxge_lro_fini(rxq);
1147 
1148 	/* Release DMA memory. */
1149 	sfxge_dma_free(&rxq->mem);
1150 
1151 	sc->rxq[index] = NULL;
1152 
1153 	free(rxq, M_SFXGE);
1154 }
1155 
1156 static int
1157 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1158 {
1159 	struct sfxge_rxq *rxq;
1160 	struct sfxge_evq *evq;
1161 	efsys_mem_t *esmp;
1162 	int rc;
1163 
1164 	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1165 
1166 	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1167 	rxq->sc = sc;
1168 	rxq->index = index;
1169 	rxq->entries = sc->rxq_entries;
1170 	rxq->ptr_mask = rxq->entries - 1;
1171 	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1172 
1173 	sc->rxq[index] = rxq;
1174 	esmp = &rxq->mem;
1175 
1176 	evq = sc->evq[index];
1177 
1178 	/* Allocate and zero DMA space. */
1179 	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1180 		return (rc);
1181 
1182 	/* Allocate buffer table entries. */
1183 	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1184 				 &rxq->buf_base_id);
1185 
1186 	/* Allocate the context array and the flow table. */
1187 	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1188 	    M_SFXGE, M_WAITOK | M_ZERO);
1189 	sfxge_lro_init(rxq);
1190 
1191 	callout_init(&rxq->refill_callout, B_TRUE);
1192 
1193 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1194 
1195 	return (0);
1196 }
1197 
1198 static const struct {
1199 	const char *name;
1200 	size_t offset;
1201 } sfxge_rx_stats[] = {
1202 #define	SFXGE_RX_STAT(name, member) \
1203 	{ #name, offsetof(struct sfxge_rxq, member) }
1204 #ifdef SFXGE_LRO
1205 	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1206 	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1207 	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1208 	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1209 	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1210 	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1211 	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1212 	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1213 #endif
1214 };
1215 
1216 static int
1217 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1218 {
1219 	struct sfxge_softc *sc = arg1;
1220 	unsigned int id = arg2;
1221 	unsigned int sum, index;
1222 
1223 	/* Sum across all RX queues */
1224 	sum = 0;
1225 	for (index = 0; index < sc->rxq_count; index++)
1226 		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1227 					 sfxge_rx_stats[id].offset);
1228 
1229 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1230 }
1231 
1232 static void
1233 sfxge_rx_stat_init(struct sfxge_softc *sc)
1234 {
1235 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1236 	struct sysctl_oid_list *stat_list;
1237 	unsigned int id;
1238 
1239 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1240 
1241 	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1242 		SYSCTL_ADD_PROC(
1243 			ctx, stat_list,
1244 			OID_AUTO, sfxge_rx_stats[id].name,
1245 			CTLTYPE_UINT|CTLFLAG_RD,
1246 			sc, id, sfxge_rx_stat_handler, "IU",
1247 			"");
1248 	}
1249 }
1250 
1251 void
1252 sfxge_rx_fini(struct sfxge_softc *sc)
1253 {
1254 	int index;
1255 
1256 	index = sc->rxq_count;
1257 	while (--index >= 0)
1258 		sfxge_rx_qfini(sc, index);
1259 
1260 	sc->rxq_count = 0;
1261 }
1262 
1263 int
1264 sfxge_rx_init(struct sfxge_softc *sc)
1265 {
1266 	struct sfxge_intr *intr;
1267 	int index;
1268 	int rc;
1269 
1270 #ifdef SFXGE_LRO
1271 	if (!ISP2(lro_table_size)) {
1272 		log(LOG_ERR, "%s=%u must be a power of 2",
1273 		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1274 		rc = EINVAL;
1275 		goto fail_lro_table_size;
1276 	}
1277 
1278 	if (lro_idle_ticks == 0)
1279 		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1280 #endif
1281 
1282 	intr = &sc->intr;
1283 
1284 	sc->rxq_count = intr->n_alloc;
1285 
1286 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1287 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1288 
1289 	/* Initialize the receive queue(s) - one per interrupt. */
1290 	for (index = 0; index < sc->rxq_count; index++) {
1291 		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1292 			goto fail;
1293 	}
1294 
1295 	sfxge_rx_stat_init(sc);
1296 
1297 	return (0);
1298 
1299 fail:
1300 	/* Tear down the receive queue(s). */
1301 	while (--index >= 0)
1302 		sfxge_rx_qfini(sc, index);
1303 
1304 	sc->rxq_count = 0;
1305 
1306 #ifdef SFXGE_LRO
1307 fail_lro_table_size:
1308 #endif
1309 	return (rc);
1310 }
1311