xref: /freebsd/sys/dev/sfxge/sfxge_rx.c (revision cdc58367265a2bd6e8f913db2bdc591699ee229f)
1 /*-
2  * Copyright (c) 2010-2011 Solarflare Communications, Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/types.h>
34 #include <sys/mbuf.h>
35 #include <sys/smp.h>
36 #include <sys/socket.h>
37 #include <sys/sysctl.h>
38 #include <sys/limits.h>
39 #include <sys/syslog.h>
40 
41 #include <net/ethernet.h>
42 #include <net/if.h>
43 #include <net/if_vlan_var.h>
44 
45 #include <netinet/in.h>
46 #include <netinet/ip.h>
47 #include <netinet/ip6.h>
48 #include <netinet/tcp.h>
49 
50 #include <machine/in_cksum.h>
51 
52 #include "common/efx.h"
53 
54 
55 #include "sfxge.h"
56 #include "sfxge_rx.h"
57 
58 #define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
59 
60 #ifdef SFXGE_LRO
61 
62 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
63 	    "Large receive offload (LRO) parameters");
64 
65 #define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
66 
67 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
68  * means we can accelerate a larger number of streams.
69  */
70 static unsigned lro_table_size = 128;
71 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
72 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
73 	    &lro_table_size, 0,
74 	    "Size of the LRO hash table (must be a power of 2)");
75 
76 /* Maximum length of a hash chain.  If chains get too long then the lookup
77  * time increases and may exceed the benefit of LRO.
78  */
79 static unsigned lro_chain_max = 20;
80 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
81 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
82 	    &lro_chain_max, 0,
83 	    "The maximum length of a hash chain");
84 
85 /* Maximum time (in ticks) that a connection can be idle before its LRO
86  * state is discarded.
87  */
88 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
89 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
90 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
91 	    &lro_idle_ticks, 0,
92 	    "The maximum time (in ticks) that a connection can be idle "
93 	    "before its LRO state is discarded");
94 
95 /* Number of packets with payload that must arrive in-order before a
96  * connection is eligible for LRO.  The idea is we should avoid coalescing
97  * segments when the sender is in slow-start because reducing the ACK rate
98  * can damage performance.
99  */
100 static int lro_slow_start_packets = 2000;
101 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
102 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
103 	    &lro_slow_start_packets, 0,
104 	    "Number of packets with payload that must arrive in-order before "
105 	    "a connection is eligible for LRO");
106 
107 /* Number of packets with payload that must arrive in-order following loss
108  * before a connection is eligible for LRO.  The idea is we should avoid
109  * coalescing segments when the sender is recovering from loss, because
110  * reducing the ACK rate can damage performance.
111  */
112 static int lro_loss_packets = 20;
113 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
114 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
115 	    &lro_loss_packets, 0,
116 	    "Number of packets with payload that must arrive in-order "
117 	    "following loss before a connection is eligible for LRO");
118 
119 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
120 #define	SFXGE_LRO_L2_ID_VLAN 0x4000
121 #define	SFXGE_LRO_L2_ID_IPV6 0x8000
122 #define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
123 #define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
124 
125 /* Compare IPv6 addresses, avoiding conditional branches */
126 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
127 				   const struct in6_addr *right)
128 {
129 #if LONG_BIT == 64
130 	const uint64_t *left64 = (const uint64_t *)left;
131 	const uint64_t *right64 = (const uint64_t *)right;
132 	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
133 #else
134 	return (left->s6_addr32[0] - right->s6_addr32[0]) |
135 	       (left->s6_addr32[1] - right->s6_addr32[1]) |
136 	       (left->s6_addr32[2] - right->s6_addr32[2]) |
137 	       (left->s6_addr32[3] - right->s6_addr32[3]);
138 #endif
139 }
140 
141 #endif	/* SFXGE_LRO */
142 
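/* Flush state callbacks: record whether an RX queue flush completed or
 * failed so that sfxge_rx_qstop() can act on the outcome.
 */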
143 void
144 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
145 {
146 
147 	rxq->flush_state = SFXGE_FLUSH_DONE;
148 }
149 
150 void
151 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
152 {
153 
154 	rxq->flush_state = SFXGE_FLUSH_FAILED;
155 }
156 
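/* RSS Toeplitz hash key programmed into the adapter by sfxge_rx_start(). */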
157 static uint8_t toep_key[] = {
158 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
159 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
160 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
161 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
162 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
163 };
164 
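/* Callout handler: post a software (magic) event tagged SFXGE_MAGIC_RX_QREFILL
 * to the paired event queue so that the refill is retried from event-queue
 * context.
 */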
165 static void
166 sfxge_rx_post_refill(void *arg)
167 {
168 	struct sfxge_rxq *rxq = arg;
169 	struct sfxge_softc *sc;
170 	unsigned int index;
171 	struct sfxge_evq *evq;
172 	uint16_t magic;
173 
174 	sc = rxq->sc;
175 	index = rxq->index;
176 	evq = sc->evq[index];
177 
178 	magic = SFXGE_MAGIC_RX_QREFILL | index;
179 
180 	/* This is guaranteed due to the start/stop order of rx and ev */
181 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
182 	    ("evq not started"));
183 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
184 	    ("rxq not started"));
185 	efx_ev_qpost(evq->common, magic);
186 }
187 
188 static void
189 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
190 {
191 	/* Initially retry after 100 ms, but back off in case of
192 	 * repeated failures as we probably have to wait for the
193 	 * administrator to raise the pool limit. */
194 	if (retrying)
195 		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
196 	else
197 		rxq->refill_delay = hz / 10;
198 
199 	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
200 			     sfxge_rx_post_refill, rxq);
201 }
202 
203 static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
204 {
205 	struct mb_args args;
206 	struct mbuf *m;
207 
208 	/* Allocate mbuf structure */
209 	args.flags = M_PKTHDR;
210 	args.type = MT_DATA;
211 	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);
212 
213 	/* Allocate (and attach) packet buffer */
214 	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
215 		uma_zfree(zone_mbuf, m);
216 		m = NULL;
217 	}
218 
219 	return (m);
220 }
221 
222 #define	SFXGE_REFILL_BATCH  64
223 
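/* Fill the RX queue with up to 'target' buffers, posting descriptors in
 * batches of SFXGE_REFILL_BATCH.  Schedules a deferred retry if mbuf
 * allocation fails.  Called with the event queue lock held.
 */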
224 static void
225 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
226 {
227 	struct sfxge_softc *sc;
228 	unsigned int index;
229 	struct sfxge_evq *evq;
230 	unsigned int batch;
231 	unsigned int rxfill;
232 	unsigned int mblksize;
233 	int ntodo;
234 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
235 
236 	sc = rxq->sc;
237 	index = rxq->index;
238 	evq = sc->evq[index];
239 
240 	prefetch_read_many(sc->enp);
241 	prefetch_read_many(rxq->common);
242 
243 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
244 
245 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
246 		return;
247 
248 	rxfill = rxq->added - rxq->completed;
249 	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
250 	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
251 	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
252 	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
253 	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
254 
255 	if (ntodo == 0)
256 		return;
257 
258 	batch = 0;
259 	mblksize = sc->rx_buffer_size;
260 	while (ntodo-- > 0) {
261 		unsigned int id;
262 		struct sfxge_rx_sw_desc *rx_desc;
263 		bus_dma_segment_t seg;
264 		struct mbuf *m;
265 
266 		id = (rxq->added + batch) & rxq->ptr_mask;
267 		rx_desc = &rxq->queue[id];
268 		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
269 
270 		rx_desc->flags = EFX_DISCARD;
271 		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
272 		if (m == NULL)
273 			break;
274 		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
275 		addr[batch++] = seg.ds_addr;
276 
277 		if (batch == SFXGE_REFILL_BATCH) {
278 			efx_rx_qpost(rxq->common, addr, mblksize, batch,
279 			    rxq->completed, rxq->added);
280 			rxq->added += batch;
281 			batch = 0;
282 		}
283 	}
284 
285 	if (ntodo != 0)
286 		sfxge_rx_schedule_refill(rxq, retrying);
287 
288 	if (batch != 0) {
289 		efx_rx_qpost(rxq->common, addr, mblksize, batch,
290 		    rxq->completed, rxq->added);
291 		rxq->added += batch;
292 	}
293 
294 	/* Make the descriptors visible to the hardware */
295 	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
296 			BUS_DMASYNC_PREWRITE);
297 
298 	efx_rx_qpush(rxq->common, rxq->added);
299 }
300 
301 void
302 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
303 {
304 
305 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
306 		return;
307 
308 	/* Make sure the queue is full */
309 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
310 }
311 
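/* Hand a completed packet to the network stack. */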
312 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
313 {
314 	struct ifnet *ifp = sc->ifnet;
315 
316 	m->m_pkthdr.rcvif = ifp;
317 	m->m_pkthdr.csum_data = 0xffff;
318 	ifp->if_input(ifp, m);
319 }
320 
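/* Convert hardware receive flags into mbuf checksum and flow-id metadata,
 * strip the RX prefix and deliver a single packet to the stack.
 */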
321 static void
322 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
323 {
324 	struct mbuf *m = rx_desc->mbuf;
325 	int csum_flags;
326 
327 	/* Convert checksum flags */
328 	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
329 		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
330 	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
331 		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
332 
333 	/* The hash covers a 4-tuple for TCP only */
334 	if (rx_desc->flags & EFX_PKT_TCP) {
335 		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
336 						       mtod(m, uint8_t *));
337 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
338 	}
339 	m->m_data += sc->rx_prefix_size;
340 	m->m_len = rx_desc->size - sc->rx_prefix_size;
341 	m->m_pkthdr.len = m->m_len;
342 	m->m_pkthdr.csum_flags = csum_flags;
343 	__sfxge_rx_deliver(sc, rx_desc->mbuf);
344 
345 	rx_desc->flags = EFX_DISCARD;
346 	rx_desc->mbuf = NULL;
347 }
348 
349 #ifdef SFXGE_LRO
350 
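/* Deliver a coalesced LRO packet: restore the IP length field to network
 * byte order, recompute the IPv4 header checksum and update the TCP header
 * from the last merged segment before passing the chain to the stack.
 */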
351 static void
352 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
353 {
354 	struct sfxge_softc *sc = st->sc;
355 	struct mbuf *m = c->mbuf;
356 	struct tcphdr *c_th;
357 	int csum_flags;
358 
359 	KASSERT(m, ("no mbuf to deliver"));
360 
361 	++st->n_bursts;
362 
363 	/* Finish off packet munging and recalculate IP header checksum. */
364 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
365 		struct ip *iph = c->nh;
366 		iph->ip_len = htons(iph->ip_len);
367 		iph->ip_sum = 0;
368 		iph->ip_sum = in_cksum_hdr(iph);
369 		c_th = (struct tcphdr *)(iph + 1);
370 		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
371 			      CSUM_IP_CHECKED | CSUM_IP_VALID);
372 	} else {
373 		struct ip6_hdr *iph = c->nh;
374 		iph->ip6_plen = htons(iph->ip6_plen);
375 		c_th = (struct tcphdr *)(iph + 1);
376 		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
377 	}
378 
379 	c_th->th_win = c->th_last->th_win;
380 	c_th->th_ack = c->th_last->th_ack;
381 	if (c_th->th_off == c->th_last->th_off) {
382 		/* Copy TCP options (take care to avoid going negative). */
383 		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
384 		memcpy(c_th + 1, c->th_last + 1, optlen);
385 	}
386 
387 	m->m_pkthdr.flowid = c->conn_hash;
388 	M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
389 
390 	m->m_pkthdr.csum_flags = csum_flags;
391 	__sfxge_rx_deliver(sc, m);
392 
393 	c->mbuf = NULL;
394 	c->delivered = 1;
395 }
396 
397 /* Drop the given connection, and add it to the free list. */
398 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
399 {
400 	unsigned bucket;
401 
402 	KASSERT(!c->mbuf, ("found orphaned mbuf"));
403 
404 	if (c->next_buf.mbuf != NULL) {
405 		sfxge_rx_deliver(rxq->sc, &c->next_buf);
406 		LIST_REMOVE(c, active_link);
407 	}
408 
409 	bucket = c->conn_hash & rxq->lro.conns_mask;
410 	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
411 	--rxq->lro.conns_n[bucket];
412 	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
413 	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
414 }
415 
416 /* Stop tracking connections that have gone idle in order to keep hash
417  * chains short.
418  */
419 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
420 {
421 	struct sfxge_lro_conn *c;
422 	unsigned i;
423 
424 	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
425 		("found active connections"));
426 
427 	rxq->lro.last_purge_ticks = now;
428 	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
429 		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
430 			continue;
431 
432 		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
433 		if (now - c->last_pkt_ticks > lro_idle_ticks) {
434 			++rxq->lro.n_drop_idle;
435 			sfxge_lro_drop(rxq, c);
436 		}
437 	}
438 }
439 
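/* Append an in-order segment to an existing LRO chain, updating the IP
 * payload length and TCP state.  Delivers early if a further segment could
 * overflow the IP length field.
 */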
440 static void
441 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
442 		struct mbuf *mbuf, struct tcphdr *th)
443 {
444 	struct tcphdr *c_th;
445 
446 	/* Tack the new mbuf onto the chain. */
447 	KASSERT(!mbuf->m_next, ("mbuf already chained"));
448 	c->mbuf_tail->m_next = mbuf;
449 	c->mbuf_tail = mbuf;
450 
451 	/* Increase length appropriately */
452 	c->mbuf->m_pkthdr.len += mbuf->m_len;
453 
454 	/* Update the connection state flags */
455 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
456 		struct ip *iph = c->nh;
457 		iph->ip_len += mbuf->m_len;
458 		c_th = (struct tcphdr *)(iph + 1);
459 	} else {
460 		struct ip6_hdr *iph = c->nh;
461 		iph->ip6_plen += mbuf->m_len;
462 		c_th = (struct tcphdr *)(iph + 1);
463 	}
464 	c_th->th_flags |= (th->th_flags & TH_PUSH);
465 	c->th_last = th;
466 	++st->n_merges;
467 
468 	/* Pass packet up now if another segment could overflow the IP
469 	 * length.
470 	 */
471 	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
472 		sfxge_lro_deliver(st, c);
473 }
474 
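/* Start a new LRO chain with this segment.  IP length fields are kept in
 * host byte order while merging is in progress.
 */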
475 static void
476 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
477 		struct mbuf *mbuf, void *nh, struct tcphdr *th)
478 {
479 	/* Start the chain */
480 	c->mbuf = mbuf;
481 	c->mbuf_tail = c->mbuf;
482 	c->nh = nh;
483 	c->th_last = th;
484 
485 	mbuf->m_pkthdr.len = mbuf->m_len;
486 
487 	/* Mangle header fields for later processing */
488 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
489 		struct ip *iph = nh;
490 		iph->ip_len = ntohs(iph->ip_len);
491 	} else {
492 		struct ip6_hdr *iph = nh;
493 		iph->ip6_plen = ntohs(iph->ip6_plen);
494 	}
495 }
496 
497 /* Try to merge or otherwise hold or deliver (as appropriate) the
498  * packet buffered for this connection (c->next_buf).  Return a flag
499  * indicating whether the connection is still active for LRO purposes.
500  */
501 static int
502 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
503 {
504 	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
505 	char *eh = c->next_eh;
506 	int data_length, hdr_length, dont_merge;
507 	unsigned th_seq, pkt_length;
508 	struct tcphdr *th;
509 	unsigned now;
510 
511 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
512 		struct ip *iph = c->next_nh;
513 		th = (struct tcphdr *)(iph + 1);
514 		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
515 	} else {
516 		struct ip6_hdr *iph = c->next_nh;
517 		th = (struct tcphdr *)(iph + 1);
518 		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
519 	}
520 
521 	hdr_length = (char *) th + th->th_off * 4 - eh;
522 	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
523 		       hdr_length);
524 	th_seq = ntohl(th->th_seq);
525 	dont_merge = ((data_length <= 0)
526 		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
527 
528 	/* Check for options other than aligned timestamp. */
529 	if (th->th_off != 5) {
530 		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
531 		if (th->th_off == 8 &&
532 		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
533 					(TCPOPT_NOP << 16) |
534 					(TCPOPT_TIMESTAMP << 8) |
535 					TCPOLEN_TIMESTAMP)) {
536 			/* timestamp option -- okay */
537 		} else {
538 			dont_merge = 1;
539 		}
540 	}
541 
542 	if (__predict_false(th_seq != c->next_seq)) {
543 		/* Out-of-order, so start counting again. */
544 		if (c->mbuf != NULL)
545 			sfxge_lro_deliver(&rxq->lro, c);
546 		c->n_in_order_pkts -= lro_loss_packets;
547 		c->next_seq = th_seq + data_length;
548 		++rxq->lro.n_misorder;
549 		goto deliver_buf_out;
550 	}
551 	c->next_seq = th_seq + data_length;
552 
553 	now = ticks;
554 	if (now - c->last_pkt_ticks > lro_idle_ticks) {
555 		++rxq->lro.n_drop_idle;
556 		if (c->mbuf != NULL)
557 			sfxge_lro_deliver(&rxq->lro, c);
558 		sfxge_lro_drop(rxq, c);
559 		return (0);
560 	}
561 	c->last_pkt_ticks = ticks;
562 
563 	if (c->n_in_order_pkts < lro_slow_start_packets) {
564 		/* May be in slow-start, so don't merge. */
565 		++rxq->lro.n_slow_start;
566 		++c->n_in_order_pkts;
567 		goto deliver_buf_out;
568 	}
569 
570 	if (__predict_false(dont_merge)) {
571 		if (c->mbuf != NULL)
572 			sfxge_lro_deliver(&rxq->lro, c);
573 		if (th->th_flags & (TH_FIN | TH_RST)) {
574 			++rxq->lro.n_drop_closed;
575 			sfxge_lro_drop(rxq, c);
576 			return (0);
577 		}
578 		goto deliver_buf_out;
579 	}
580 
581 	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
582 
583 	if (__predict_true(c->mbuf != NULL)) {
584 		/* Remove headers and any padding */
585 		rx_buf->mbuf->m_data += hdr_length;
586 		rx_buf->mbuf->m_len = data_length;
587 
588 		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
589 	} else {
590 		/* Remove any padding */
591 		rx_buf->mbuf->m_len = pkt_length;
592 
593 		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
594 	}
595 
596 	rx_buf->mbuf = NULL;
597 	return (1);
598 
599  deliver_buf_out:
600 	sfxge_rx_deliver(rxq->sc, rx_buf);
601 	return (1);
602 }
603 
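/* Start tracking a new connection, reusing a descriptor from the free list
 * if possible.  Give up if the hash bucket already holds lro_chain_max
 * connections.
 */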
604 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
605 			       uint16_t l2_id, void *nh, struct tcphdr *th)
606 {
607 	unsigned bucket = conn_hash & st->conns_mask;
608 	struct sfxge_lro_conn *c;
609 
610 	if (st->conns_n[bucket] >= lro_chain_max) {
611 		++st->n_too_many;
612 		return;
613 	}
614 
615 	if (!TAILQ_EMPTY(&st->free_conns)) {
616 		c = TAILQ_FIRST(&st->free_conns);
617 		TAILQ_REMOVE(&st->free_conns, c, link);
618 	} else {
619 		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
620 		if (c == NULL)
621 			return;
622 		c->mbuf = NULL;
623 		c->next_buf.mbuf = NULL;
624 	}
625 
626 	/* Create the connection tracking data */
627 	++st->conns_n[bucket];
628 	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
629 	c->l2_id = l2_id;
630 	c->conn_hash = conn_hash;
631 	c->source = th->th_sport;
632 	c->dest = th->th_dport;
633 	c->n_in_order_pkts = 0;
634 	c->last_pkt_ticks = *(volatile int *)&ticks;
635 	c->delivered = 0;
636 	++st->n_new_stream;
637 	/* NB. We don't initialise c->next_seq, and it doesn't matter what
638 	 * value it has.  Most likely the next packet received for this
639 	 * connection will not match -- no harm done.
640 	 */
641 }
642 
643 /* Process mbuf and decide whether to dispatch it to the stack now or
644  * later.
645  */
646 static void
647 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
648 {
649 	struct sfxge_softc *sc = rxq->sc;
650 	struct mbuf *m = rx_buf->mbuf;
651 	struct ether_header *eh;
652 	struct sfxge_lro_conn *c;
653 	uint16_t l2_id;
654 	uint16_t l3_proto;
655 	void *nh;
656 	struct tcphdr *th;
657 	uint32_t conn_hash;
658 	unsigned bucket;
659 
660 	/* Get the hardware hash */
661 	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
662 				      mtod(m, uint8_t *));
663 
664 	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
665 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
666 		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
667 		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
668 			SFXGE_LRO_L2_ID_VLAN;
669 		l3_proto = veh->evl_proto;
670 		nh = veh + 1;
671 	} else {
672 		l2_id = 0;
673 		l3_proto = eh->ether_type;
674 		nh = eh + 1;
675 	}
676 
677 	/* Check whether this is a suitable packet (unfragmented
678 	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
679 	 * length, and compute a hash if necessary.  If not, return.
680 	 */
681 	if (l3_proto == htons(ETHERTYPE_IP)) {
682 		struct ip *iph = nh;
683 		if ((iph->ip_p - IPPROTO_TCP) |
684 		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
685 		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
686 			goto deliver_now;
687 		th = (struct tcphdr *)(iph + 1);
688 	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
689 		struct ip6_hdr *iph = nh;
690 		if (iph->ip6_nxt != IPPROTO_TCP)
691 			goto deliver_now;
692 		l2_id |= SFXGE_LRO_L2_ID_IPV6;
693 		th = (struct tcphdr *)(iph + 1);
694 	} else {
695 		goto deliver_now;
696 	}
697 
698 	bucket = conn_hash & rxq->lro.conns_mask;
699 
700 	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
701 		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
702 			continue;
703 		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
704 			continue;
705 		if (c->mbuf != NULL) {
706 			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
707 				struct ip *c_iph, *iph = nh;
708 				c_iph = c->nh;
709 				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
710 				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
711 					continue;
712 			} else {
713 				struct ip6_hdr *c_iph, *iph = nh;
714 				c_iph = c->nh;
715 				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
716 				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
717 					continue;
718 			}
719 		}
720 
721 		/* Re-insert at head of list to reduce lookup time. */
722 		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
723 		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
724 
725 		if (c->next_buf.mbuf != NULL) {
726 			if (!sfxge_lro_try_merge(rxq, c))
727 				goto deliver_now;
728 		} else {
729 			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
730 			    active_link);
731 		}
732 		c->next_buf = *rx_buf;
733 		c->next_eh = eh;
734 		c->next_nh = nh;
735 
736 		rx_buf->mbuf = NULL;
737 		rx_buf->flags = EFX_DISCARD;
738 		return;
739 	}
740 
741 	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
742  deliver_now:
743 	sfxge_rx_deliver(sc, rx_buf);
744 }
745 
746 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
747 {
748 	struct sfxge_lro_state *st = &rxq->lro;
749 	struct sfxge_lro_conn *c;
750 	unsigned t;
751 
752 	while (!LIST_EMPTY(&st->active_conns)) {
753 		c = LIST_FIRST(&st->active_conns);
754 		if (!c->delivered && c->mbuf != NULL)
755 			sfxge_lro_deliver(st, c);
756 		if (sfxge_lro_try_merge(rxq, c)) {
757 			if (c->mbuf != NULL)
758 				sfxge_lro_deliver(st, c);
759 			LIST_REMOVE(c, active_link);
760 		}
761 		c->delivered = 0;
762 	}
763 
764 	t = *(volatile int *)&ticks;
765 	if (__predict_false(t != st->last_purge_ticks))
766 		sfxge_lro_purge_idle(rxq, t);
767 }
768 
769 #else	/* !SFXGE_LRO */
770 
771 static void
772 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
773 {
774 }
775 
776 static void
777 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
778 {
779 }
780 
781 #endif	/* SFXGE_LRO */
782 
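/* Process completed receive descriptors: drop discards and loopback
 * packets, pass the rest to LRO or directly to the stack, and top up the
 * queue if it has drained below the refill threshold.
 */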
783 void
784 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
785 {
786 	struct sfxge_softc *sc = rxq->sc;
787 	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
788 	unsigned int index;
789 	struct sfxge_evq *evq;
790 	unsigned int completed;
791 	unsigned int level;
792 	struct mbuf *m;
793 	struct sfxge_rx_sw_desc *prev = NULL;
794 
795 	index = rxq->index;
796 	evq = sc->evq[index];
797 
798 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
799 
800 	completed = rxq->completed;
801 	while (completed != rxq->pending) {
802 		unsigned int id;
803 		struct sfxge_rx_sw_desc *rx_desc;
804 
805 		id = completed++ & rxq->ptr_mask;
806 		rx_desc = &rxq->queue[id];
807 		m = rx_desc->mbuf;
808 
809 		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
810 			goto discard;
811 
812 		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
813 			goto discard;
814 
815 		prefetch_read_many(mtod(m, caddr_t));
816 
817 		/* Check for loopback packets */
818 		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
819 		    !(rx_desc->flags & EFX_PKT_IPV6)) {
820 			struct ether_header *etherhp;
821 
822 			/*LINTED*/
823 			etherhp = mtod(m, struct ether_header *);
824 
825 			if (etherhp->ether_type ==
826 			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
827 				EFSYS_PROBE(loopback);
828 
829 				rxq->loopback++;
830 				goto discard;
831 			}
832 		}
833 
834 		/* Pass packet up the stack or into LRO (pipelined) */
835 		if (prev != NULL) {
836 			if (lro_enabled)
837 				sfxge_lro(rxq, prev);
838 			else
839 				sfxge_rx_deliver(sc, prev);
840 		}
841 		prev = rx_desc;
842 		continue;
843 
844 discard:
845 		/* Return the packet to the pool */
846 		m_free(m);
847 		rx_desc->mbuf = NULL;
848 	}
849 	rxq->completed = completed;
850 
851 	level = rxq->added - rxq->completed;
852 
853 	/* Pass last packet up the stack or into LRO */
854 	if (prev != NULL) {
855 		if (lro_enabled)
856 			sfxge_lro(rxq, prev);
857 		else
858 			sfxge_rx_deliver(sc, prev);
859 	}
860 
861 	/*
862 	 * If there are any pending flows and this is the end of the
863 	 * poll then they must be completed.
864 	 */
865 	if (eop)
866 		sfxge_lro_end_of_burst(rxq);
867 
868 	/* Top up the queue if necessary */
869 	if (level < rxq->refill_threshold)
870 		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
871 }
872 
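/* Stop an RX queue: flush it (retrying if the flush fails), drain any
 * outstanding completions and release the common-code queue and its
 * buffer table entries.
 */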
873 static void
874 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
875 {
876 	struct sfxge_rxq *rxq;
877 	struct sfxge_evq *evq;
878 	unsigned int count;
879 
880 	rxq = sc->rxq[index];
881 	evq = sc->evq[index];
882 
883 	SFXGE_EVQ_LOCK(evq);
884 
885 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
886 	    ("rxq not started"));
887 
888 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
889 
890 	callout_stop(&rxq->refill_callout);
891 
892 again:
893 	rxq->flush_state = SFXGE_FLUSH_PENDING;
894 
895 	/* Flush the receive queue */
896 	efx_rx_qflush(rxq->common);
897 
898 	SFXGE_EVQ_UNLOCK(evq);
899 
900 	count = 0;
901 	do {
902 		/* Spin for 100 ms */
903 		DELAY(100000);
904 
905 		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
906 			break;
907 
908 	} while (++count < 20);
909 
910 	SFXGE_EVQ_LOCK(evq);
911 
912 	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
913 		goto again;
914 
915 	rxq->flush_state = SFXGE_FLUSH_DONE;
916 
917 	rxq->pending = rxq->added;
918 	sfxge_rx_qcomplete(rxq, B_TRUE);
919 
920 	KASSERT(rxq->completed == rxq->pending,
921 	    ("rxq->completed != rxq->pending"));
922 
923 	rxq->added = 0;
924 	rxq->pending = 0;
925 	rxq->completed = 0;
926 	rxq->loopback = 0;
927 
928 	/* Destroy the common code receive queue. */
929 	efx_rx_qdestroy(rxq->common);
930 
931 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
932 	    EFX_RXQ_NBUFS(sc->rxq_entries));
933 
934 	SFXGE_EVQ_UNLOCK(evq);
935 }
936 
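/* Start an RX queue: program the buffer table, create and enable the
 * common-code queue and pre-fill it with receive buffers.
 */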
937 static int
938 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
939 {
940 	struct sfxge_rxq *rxq;
941 	efsys_mem_t *esmp;
942 	struct sfxge_evq *evq;
943 	int rc;
944 
945 	rxq = sc->rxq[index];
946 	esmp = &rxq->mem;
947 	evq = sc->evq[index];
948 
949 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
950 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
951 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
952 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
953 
954 	/* Program the buffer table. */
955 	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
956 	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
957 		return (rc);
958 
959 	/* Create the common code receive queue. */
960 	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
961 	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
962 	    &rxq->common)) != 0)
963 		goto fail;
964 
965 	SFXGE_EVQ_LOCK(evq);
966 
967 	/* Enable the receive queue. */
968 	efx_rx_qenable(rxq->common);
969 
970 	rxq->init_state = SFXGE_RXQ_STARTED;
971 
972 	/* Try to fill the queue from the pool. */
973 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
974 
975 	SFXGE_EVQ_UNLOCK(evq);
976 
977 	return (0);
978 
979 fail:
980 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
981 	    EFX_RXQ_NBUFS(sc->rxq_entries));
982 	return (rc);
983 }
984 
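/* Stop all RX queues and shut down the common-code receive module. */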
985 void
986 sfxge_rx_stop(struct sfxge_softc *sc)
987 {
988 	int index;
989 
990 	/* Stop the receive queue(s) */
991 	index = sc->rxq_count;
992 	while (--index >= 0)
993 		sfxge_rx_qstop(sc, index);
994 
995 	sc->rx_prefix_size = 0;
996 	sc->rx_buffer_size = 0;
997 
998 	efx_rx_fini(sc->enp);
999 }
1000 
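/* Start receive processing: size the packet buffers, select the mbuf zone,
 * program the RSS indirection table and Toeplitz key, and start every
 * RX queue.
 */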
1001 int
1002 sfxge_rx_start(struct sfxge_softc *sc)
1003 {
1004 	struct sfxge_intr *intr;
1005 	int index;
1006 	int rc;
1007 
1008 	intr = &sc->intr;
1009 
1010 	/* Initialize the common code receive module. */
1011 	if ((rc = efx_rx_init(sc->enp)) != 0)
1012 		return (rc);
1013 
1014 	/* Calculate the receive packet buffer size. */
1015 	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
1016 	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
1017 			      sc->rx_prefix_size);
1018 
1019 	/* Select zone for packet buffers */
1020 	if (sc->rx_buffer_size <= MCLBYTES)
1021 		sc->rx_buffer_zone = zone_clust;
1022 	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
1023 		sc->rx_buffer_zone = zone_jumbop;
1024 	else if (sc->rx_buffer_size <= MJUM9BYTES)
1025 		sc->rx_buffer_zone = zone_jumbo9;
1026 	else
1027 		sc->rx_buffer_zone = zone_jumbo16;
1028 
1029 	/*
1030 	 * Set up the scale table.  Enable all hash types and hash insertion.
1031 	 */
1032 	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1033 		sc->rx_indir_table[index] = index % sc->rxq_count;
1034 	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1035 				       SFXGE_RX_SCALE_MAX)) != 0)
1036 		goto fail;
1037 	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1038 	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1039 	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1040 
1041 	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
1042 	    sizeof(toep_key))) != 0)
1043 		goto fail;
1044 
1045 	/* Start the receive queue(s). */
1046 	for (index = 0; index < sc->rxq_count; index++) {
1047 		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1048 			goto fail2;
1049 	}
1050 
1051 	return (0);
1052 
1053 fail2:
1054 	while (--index >= 0)
1055 		sfxge_rx_qstop(sc, index);
1056 
1057 fail:
1058 	efx_rx_fini(sc->enp);
1059 
1060 	return (rc);
1061 }
1062 
1063 #ifdef SFXGE_LRO
1064 
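/* Allocate and initialise the per-queue LRO hash table and connection
 * lists.
 */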
1065 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1066 {
1067 	struct sfxge_lro_state *st = &rxq->lro;
1068 	unsigned i;
1069 
1070 	st->conns_mask = lro_table_size - 1;
1071 	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1072 		("lro_table_size must be a power of 2"));
1073 	st->sc = rxq->sc;
1074 	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1075 			   M_SFXGE, M_WAITOK);
1076 	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1077 			     M_SFXGE, M_WAITOK);
1078 	for (i = 0; i <= st->conns_mask; ++i) {
1079 		TAILQ_INIT(&st->conns[i]);
1080 		st->conns_n[i] = 0;
1081 	}
1082 	LIST_INIT(&st->active_conns);
1083 	TAILQ_INIT(&st->free_conns);
1084 }
1085 
1086 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1087 {
1088 	struct sfxge_lro_state *st = &rxq->lro;
1089 	struct sfxge_lro_conn *c;
1090 	unsigned i;
1091 
1092 	/* Return cleanly if sfxge_lro_init() has not been called. */
1093 	if (st->conns == NULL)
1094 		return;
1095 
1096 	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1097 
1098 	for (i = 0; i <= st->conns_mask; ++i) {
1099 		while (!TAILQ_EMPTY(&st->conns[i])) {
1100 			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1101 			sfxge_lro_drop(rxq, c);
1102 		}
1103 	}
1104 
1105 	while (!TAILQ_EMPTY(&st->free_conns)) {
1106 		c = TAILQ_FIRST(&st->free_conns);
1107 		TAILQ_REMOVE(&st->free_conns, c, link);
1108 		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1109 		free(c, M_SFXGE);
1110 	}
1111 
1112 	free(st->conns_n, M_SFXGE);
1113 	free(st->conns, M_SFXGE);
1114 	st->conns = NULL;
1115 }
1116 
1117 #else
1118 
1119 static void
1120 sfxge_lro_init(struct sfxge_rxq *rxq)
1121 {
1122 }
1123 
1124 static void
1125 sfxge_lro_fini(struct sfxge_rxq *rxq)
1126 {
1127 }
1128 
1129 #endif	/* SFXGE_LRO */
1130 
1131 static void
1132 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1133 {
1134 	struct sfxge_rxq *rxq;
1135 
1136 	rxq = sc->rxq[index];
1137 
1138 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1139 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1140 
1141 	/* Free the context array and the flow table. */
1142 	free(rxq->queue, M_SFXGE);
1143 	sfxge_lro_fini(rxq);
1144 
1145 	/* Release DMA memory. */
1146 	sfxge_dma_free(&rxq->mem);
1147 
1148 	sc->rxq[index] = NULL;
1149 
1150 	free(rxq, M_SFXGE);
1151 }
1152 
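/* Allocate and initialise the software state for one RX queue: descriptor
 * DMA memory, buffer table entries, the context array and LRO state.
 */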
1153 static int
1154 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1155 {
1156 	struct sfxge_rxq *rxq;
1157 	struct sfxge_evq *evq;
1158 	efsys_mem_t *esmp;
1159 	int rc;
1160 
1161 	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1162 
1163 	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1164 	rxq->sc = sc;
1165 	rxq->index = index;
1166 	rxq->entries = sc->rxq_entries;
1167 	rxq->ptr_mask = rxq->entries - 1;
1168 	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1169 
1170 	sc->rxq[index] = rxq;
1171 	esmp = &rxq->mem;
1172 
1173 	evq = sc->evq[index];
1174 
1175 	/* Allocate and zero DMA space. */
1176 	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1177 		return (rc);
1178 
1179 	/* Allocate buffer table entries. */
1180 	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1181 				 &rxq->buf_base_id);
1182 
1183 	/* Allocate the context array and the flow table. */
1184 	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1185 	    M_SFXGE, M_WAITOK | M_ZERO);
1186 	sfxge_lro_init(rxq);
1187 
1188 	callout_init(&rxq->refill_callout, B_TRUE);
1189 
1190 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1191 
1192 	return (0);
1193 }
1194 
1195 static const struct {
1196 	const char *name;
1197 	size_t offset;
1198 } sfxge_rx_stats[] = {
1199 #define	SFXGE_RX_STAT(name, member) \
1200 	{ #name, offsetof(struct sfxge_rxq, member) }
1201 #ifdef SFXGE_LRO
1202 	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1203 	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1204 	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1205 	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1206 	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1207 	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1208 	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1209 	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1210 #endif
1211 };
1212 
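/* Sysctl handler: report an RX statistic summed across all RX queues. */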
1213 static int
1214 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1215 {
1216 	struct sfxge_softc *sc = arg1;
1217 	unsigned int id = arg2;
1218 	unsigned int sum, index;
1219 
1220 	/* Sum across all RX queues */
1221 	sum = 0;
1222 	for (index = 0; index < sc->rxq_count; index++)
1223 		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1224 					 sfxge_rx_stats[id].offset);
1225 
1226 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1227 }
1228 
1229 static void
1230 sfxge_rx_stat_init(struct sfxge_softc *sc)
1231 {
1232 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1233 	struct sysctl_oid_list *stat_list;
1234 	unsigned int id;
1235 
1236 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1237 
1238 	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1239 		SYSCTL_ADD_PROC(
1240 			ctx, stat_list,
1241 			OID_AUTO, sfxge_rx_stats[id].name,
1242 			CTLTYPE_UINT|CTLFLAG_RD,
1243 			sc, id, sfxge_rx_stat_handler, "IU",
1244 			"");
1245 	}
1246 }
1247 
1248 void
1249 sfxge_rx_fini(struct sfxge_softc *sc)
1250 {
1251 	int index;
1252 
1253 	index = sc->rxq_count;
1254 	while (--index >= 0)
1255 		sfxge_rx_qfini(sc, index);
1256 
1257 	sc->rxq_count = 0;
1258 }
1259 
1260 int
1261 sfxge_rx_init(struct sfxge_softc *sc)
1262 {
1263 	struct sfxge_intr *intr;
1264 	int index;
1265 	int rc;
1266 
1267 #ifdef SFXGE_LRO
1268 	if (!ISP2(lro_table_size)) {
1269 		log(LOG_ERR, "%s=%u must be a power of 2",
1270 		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1271 		rc = EINVAL;
1272 		goto fail_lro_table_size;
1273 	}
1274 
1275 	if (lro_idle_ticks == 0)
1276 		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1277 #endif
1278 
1279 	intr = &sc->intr;
1280 
1281 	sc->rxq_count = intr->n_alloc;
1282 
1283 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1284 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1285 
1286 	/* Initialize the receive queue(s) - one per interrupt. */
1287 	for (index = 0; index < sc->rxq_count; index++) {
1288 		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1289 			goto fail;
1290 	}
1291 
1292 	sfxge_rx_stat_init(sc);
1293 
1294 	return (0);
1295 
1296 fail:
1297 	/* Tear down the receive queue(s). */
1298 	while (--index >= 0)
1299 		sfxge_rx_qfini(sc, index);
1300 
1301 	sc->rxq_count = 0;
1302 
1303 #ifdef SFXGE_LRO
1304 fail_lro_table_size:
1305 #endif
1306 	return (rc);
1307 }
1308