xref: /freebsd/sys/dev/sfxge/sfxge_rx.c (revision 46c1105fbb6fbff6d6ccd0a18571342eb992d637)
1 /*-
2  * Copyright (c) 2010-2015 Solarflare Communications Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright notice,
12  *    this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  *    this list of conditions and the following disclaimer in the documentation
15  *    and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * The views and conclusions contained in the software and documentation are
30  * those of the authors and should not be interpreted as representing official
31  * policies, either expressed or implied, of the FreeBSD Project.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/param.h>
38 #include <sys/malloc.h>
39 #include <sys/mbuf.h>
40 #include <sys/smp.h>
41 #include <sys/socket.h>
42 #include <sys/sysctl.h>
43 #include <sys/syslog.h>
44 #include <sys/limits.h>
45 #include <sys/syslog.h>
46 
47 #include <net/ethernet.h>
48 #include <net/if.h>
49 #include <net/if_vlan_var.h>
50 
51 #include <netinet/in.h>
52 #include <netinet/ip.h>
53 #include <netinet/ip6.h>
54 #include <netinet/tcp.h>
55 
56 #include <machine/in_cksum.h>
57 
58 #include "common/efx.h"
59 
60 
61 #include "sfxge.h"
62 #include "sfxge_rx.h"
63 
/*
 * Refill threshold for a receive queue with _entries descriptors:
 * nine tenths of EFX_RXQ_LIMIT(_entries), computed with integer arithmetic.
 * NOTE(review): presumably the source of rxq->refill_threshold, which is
 * set outside this view -- confirm.
 */
64 #define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
65 
66 #ifdef SFXGE_LRO
67 
/* Parent sysctl node ("lro" under _hw_sfxge) for the LRO parameters below. */
68 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
69 	    "Large receive offload (LRO) parameters");
70 
/* Name an LRO tunable: prefixes _param with "lro." and hands the result to
 * SFXGE_PARAM().
 */
71 #define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
72 
73 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
74  * means we can accelerate a larger number of streams.
75  */
76 static unsigned lro_table_size = 128;
77 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
78 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
79 	    &lro_table_size, 0,
80 	    "Size of the LRO hash table (must be a power of 2)");
81 
82 /* Maximum length of a hash chain.  If chains get too long then the lookup
83  * time increases and may exceed the benefit of LRO.
 * sfxge_lro_new_conn() refuses to track a new connection in a bucket that
 * already holds this many.
84  */
85 static unsigned lro_chain_max = 20;
86 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
87 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
88 	    &lro_chain_max, 0,
89 	    "The maximum length of a hash chain");
90 
91 /* Maximum time (in ticks) that a connection can be idle before its LRO
92  * state is discarded.
93  */
94 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
95 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
96 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
97 	    &lro_idle_ticks, 0,
98 	    "The maximum time (in ticks) that a connection can be idle "
99 	    "before it's LRO state is discarded");
100 
101 /* Number of packets with payload that must arrive in-order before a
102  * connection is eligible for LRO.  The idea is we should avoid coalescing
103  * segments when the sender is in slow-start because reducing the ACK rate
104  * can damage performance.
 *
 * NOTE(review): declared as (signed) int but exported with SYSCTL_UINT;
 * it is compared against the connection's n_in_order_pkts counter in
 * sfxge_lro_try_merge() -- confirm the intended signedness.
105  */
106 static int lro_slow_start_packets = 2000;
107 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
108 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
109 	    &lro_slow_start_packets, 0,
110 	    "Number of packets with payload that must arrive in-order before "
111 	    "a connection is eligible for LRO");
112 
113 /* Number of packets with payload that must arrive in-order following loss
114  * before a connection is eligible for LRO.  The idea is we should avoid
115  * coalescing segments when the sender is recovering from loss, because
116  * reducing the ACK rate can damage performance.
 *
 * sfxge_lro_try_merge() subtracts this value from the connection's
 * n_in_order_pkts counter whenever a segment arrives out of order.
 * NOTE(review): also a signed int exported with SYSCTL_UINT -- confirm.
117  */
118 static int lro_loss_packets = 20;
119 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
120 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
121 	    &lro_loss_packets, 0,
122 	    "Number of packets with payload that must arrive in-order "
123 	    "following loss before a connection is eligible for LRO");
124 
125 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
/* Set when the connection's frames carry a VLAN tag (see sfxge_lro()). */
126 #define	SFXGE_LRO_L2_ID_VLAN 0x4000
/* Set for TCP/IPv6 connections; clear means TCP/IPv4. */
127 #define	SFXGE_LRO_L2_ID_IPV6 0x8000
128 #define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
129 #define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
130 
/*
 * Compare two IPv6 addresses without conditional branches.
 *
 * Returns zero if the addresses are identical and a non-zero value
 * otherwise.  Only the zero/non-zero distinction is meaningful; callers
 * OR several results together and test the combined value.
 *
 * The address bytes are copied into local 64-bit words with memcpy()
 * rather than being read through a cast pointer: struct in6_addr is not
 * guaranteed to be 8-byte aligned (it normally sits inside a received
 * packet), so dereferencing it as uint64_t could fault on
 * strict-alignment machines and is a type-punned access.
 */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
	uint64_t lw[2], rw[2];
	uint64_t diff;

	memcpy(lw, left, sizeof(lw));
	memcpy(rw, right, sizeof(rw));

	/* Any differing bit in either half makes diff non-zero. */
	diff = (lw[0] ^ rw[0]) | (lw[1] ^ rw[1]);

	/*
	 * Fold the upper half into the lower half so that a difference is
	 * not lost when unsigned long is only 32 bits wide.
	 */
	return (unsigned long)((diff >> 32) | (diff & 0xffffffffu));
}
146 
147 #endif	/* SFXGE_LRO */
148 
149 void
150 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
151 {
152 
153 	rxq->flush_state = SFXGE_FLUSH_DONE;
154 }
155 
156 void
157 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
158 {
159 
160 	rxq->flush_state = SFXGE_FLUSH_FAILED;
161 }
162 
/*
 * 40-byte key table.  The name and the EFX_RX_HASHALG_TOEPLITZ hash used
 * elsewhere in this file suggest it is the Toeplitz RSS hash key, but the
 * code that consumes it is outside this view.
 * NOTE(review): the byte values look like the widely published default
 * RSS key -- confirm against the hardware/RSS documentation.
 */
163 static uint8_t toep_key[] = {
164 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
165 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
166 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
167 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
168 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
169 };
170 
171 static void
172 sfxge_rx_post_refill(void *arg)
173 {
174 	struct sfxge_rxq *rxq = arg;
175 	struct sfxge_softc *sc;
176 	unsigned int index;
177 	struct sfxge_evq *evq;
178 	uint16_t magic;
179 
180 	sc = rxq->sc;
181 	index = rxq->index;
182 	evq = sc->evq[index];
183 
184 	magic = SFXGE_MAGIC_RX_QREFILL | index;
185 
186 	/* This is guaranteed due to the start/stop order of rx and ev */
187 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
188 	    ("evq not started"));
189 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
190 	    ("rxq not started"));
191 	efx_ev_qpost(evq->common, magic);
192 }
193 
194 static void
195 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
196 {
197 	/* Initially retry after 100 ms, but back off in case of
198 	 * repeated failures as we probably have to wait for the
199 	 * administrator to raise the pool limit. */
200 	if (retrying)
201 		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
202 	else
203 		rxq->refill_delay = hz / 10;
204 
205 	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
206 			     sfxge_rx_post_refill, rxq);
207 }
208 
209 #define	SFXGE_REFILL_BATCH  64
210 
211 static void
212 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
213 {
214 	struct sfxge_softc *sc;
215 	unsigned int index;
216 	struct sfxge_evq *evq;
217 	unsigned int batch;
218 	unsigned int rxfill;
219 	unsigned int mblksize;
220 	int ntodo;
221 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
222 
223 	sc = rxq->sc;
224 	index = rxq->index;
225 	evq = sc->evq[index];
226 
227 	prefetch_read_many(sc->enp);
228 	prefetch_read_many(rxq->common);
229 
230 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
231 
232 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
233 		return;
234 
235 	rxfill = rxq->added - rxq->completed;
236 	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
237 	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
238 	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
239 	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
240 	    ("ntodo > EFX_RQX_LIMIT(rxq->entries)"));
241 
242 	if (ntodo == 0)
243 		return;
244 
245 	batch = 0;
246 	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
247 	while (ntodo-- > 0) {
248 		unsigned int id;
249 		struct sfxge_rx_sw_desc *rx_desc;
250 		bus_dma_segment_t seg;
251 		struct mbuf *m;
252 
253 		id = (rxq->added + batch) & rxq->ptr_mask;
254 		rx_desc = &rxq->queue[id];
255 		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
256 
257 		rx_desc->flags = EFX_DISCARD;
258 		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
259 		    sc->rx_cluster_size);
260 		if (m == NULL)
261 			break;
262 
263 		/* m_len specifies length of area to be mapped for DMA */
264 		m->m_len  = mblksize;
265 		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
266 		m->m_data += sc->rx_buffer_align;
267 
268 		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
269 		addr[batch++] = seg.ds_addr;
270 
271 		if (batch == SFXGE_REFILL_BATCH) {
272 			efx_rx_qpost(rxq->common, addr, mblksize, batch,
273 			    rxq->completed, rxq->added);
274 			rxq->added += batch;
275 			batch = 0;
276 		}
277 	}
278 
279 	if (ntodo != 0)
280 		sfxge_rx_schedule_refill(rxq, retrying);
281 
282 	if (batch != 0) {
283 		efx_rx_qpost(rxq->common, addr, mblksize, batch,
284 		    rxq->completed, rxq->added);
285 		rxq->added += batch;
286 	}
287 
288 	/* Make the descriptors visible to the hardware */
289 	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
290 			BUS_DMASYNC_PREWRITE);
291 
292 	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
293 
294 	/* The queue could still be empty if no descriptors were actually
295 	 * pushed, in which case there will be no event to cause the next
296 	 * refill, so we must schedule a refill ourselves.
297 	 */
298 	if(rxq->pushed == rxq->completed) {
299 		sfxge_rx_schedule_refill(rxq, retrying);
300 	}
301 }
302 
303 void
304 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
305 {
306 
307 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
308 		return;
309 
310 	/* Make sure the queue is full */
311 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
312 }
313 
314 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
315 {
316 	struct ifnet *ifp = sc->ifnet;
317 
318 	m->m_pkthdr.rcvif = ifp;
319 	m->m_pkthdr.csum_data = 0xffff;
320 	ifp->if_input(ifp, m);
321 }
322 
323 static void
324 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
325 {
326 	struct mbuf *m = rx_desc->mbuf;
327 	int flags = rx_desc->flags;
328 	int csum_flags;
329 
330 	/* Convert checksum flags */
331 	csum_flags = (flags & EFX_CKSUM_IPV4) ?
332 		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
333 	if (flags & EFX_CKSUM_TCPUDP)
334 		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
335 
336 	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
337 		m->m_pkthdr.flowid =
338 			efx_psuedo_hdr_hash_get(sc->enp,
339 						EFX_RX_HASHALG_TOEPLITZ,
340 						mtod(m, uint8_t *));
341 		/* The hash covers a 4-tuple for TCP only */
342 		M_HASHTYPE_SET(m,
343 		    (flags & EFX_PKT_IPV4) ?
344 			((flags & EFX_PKT_TCP) ?
345 			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
346 			((flags & EFX_PKT_TCP) ?
347 			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
348 	}
349 	m->m_data += sc->rx_prefix_size;
350 	m->m_len = rx_desc->size - sc->rx_prefix_size;
351 	m->m_pkthdr.len = m->m_len;
352 	m->m_pkthdr.csum_flags = csum_flags;
353 	__sfxge_rx_deliver(sc, rx_desc->mbuf);
354 
355 	rx_desc->flags = EFX_DISCARD;
356 	rx_desc->mbuf = NULL;
357 }
358 
359 #ifdef SFXGE_LRO
360 
361 static void
362 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
363 {
364 	struct sfxge_softc *sc = st->sc;
365 	struct mbuf *m = c->mbuf;
366 	struct tcphdr *c_th;
367 	int csum_flags;
368 
369 	KASSERT(m, ("no mbuf to deliver"));
370 
371 	++st->n_bursts;
372 
373 	/* Finish off packet munging and recalculate IP header checksum. */
374 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
375 		struct ip *iph = c->nh;
376 		iph->ip_len = htons(iph->ip_len);
377 		iph->ip_sum = 0;
378 		iph->ip_sum = in_cksum_hdr(iph);
379 		c_th = (struct tcphdr *)(iph + 1);
380 		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
381 			      CSUM_IP_CHECKED | CSUM_IP_VALID);
382 	} else {
383 		struct ip6_hdr *iph = c->nh;
384 		iph->ip6_plen = htons(iph->ip6_plen);
385 		c_th = (struct tcphdr *)(iph + 1);
386 		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
387 	}
388 
389 	c_th->th_win = c->th_last->th_win;
390 	c_th->th_ack = c->th_last->th_ack;
391 	if (c_th->th_off == c->th_last->th_off) {
392 		/* Copy TCP options (take care to avoid going negative). */
393 		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
394 		memcpy(c_th + 1, c->th_last + 1, optlen);
395 	}
396 
397 	m->m_pkthdr.flowid = c->conn_hash;
398 	M_HASHTYPE_SET(m,
399 	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
400 		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
401 
402 	m->m_pkthdr.csum_flags = csum_flags;
403 	__sfxge_rx_deliver(sc, m);
404 
405 	c->mbuf = NULL;
406 	c->delivered = 1;
407 }
408 
409 /* Drop the given connection, and add it to the free list. */
410 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
411 {
412 	unsigned bucket;
413 
414 	KASSERT(!c->mbuf, ("found orphaned mbuf"));
415 
416 	if (c->next_buf.mbuf != NULL) {
417 		sfxge_rx_deliver(rxq->sc, &c->next_buf);
418 		LIST_REMOVE(c, active_link);
419 	}
420 
421 	bucket = c->conn_hash & rxq->lro.conns_mask;
422 	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
423 	--rxq->lro.conns_n[bucket];
424 	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
425 	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
426 }
427 
428 /* Stop tracking connections that have gone idle in order to keep hash
429  * chains short.
430  */
431 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
432 {
433 	struct sfxge_lro_conn *c;
434 	unsigned i;
435 
436 	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
437 		("found active connections"));
438 
439 	rxq->lro.last_purge_ticks = now;
440 	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
441 		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
442 			continue;
443 
444 		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
445 		if (now - c->last_pkt_ticks > lro_idle_ticks) {
446 			++rxq->lro.n_drop_idle;
447 			sfxge_lro_drop(rxq, c);
448 		}
449 	}
450 }
451 
452 static void
453 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
454 		struct mbuf *mbuf, struct tcphdr *th)
455 {
456 	struct tcphdr *c_th;
457 
458 	/* Tack the new mbuf onto the chain. */
459 	KASSERT(!mbuf->m_next, ("mbuf already chained"));
460 	c->mbuf_tail->m_next = mbuf;
461 	c->mbuf_tail = mbuf;
462 
463 	/* Increase length appropriately */
464 	c->mbuf->m_pkthdr.len += mbuf->m_len;
465 
466 	/* Update the connection state flags */
467 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
468 		struct ip *iph = c->nh;
469 		iph->ip_len += mbuf->m_len;
470 		c_th = (struct tcphdr *)(iph + 1);
471 	} else {
472 		struct ip6_hdr *iph = c->nh;
473 		iph->ip6_plen += mbuf->m_len;
474 		c_th = (struct tcphdr *)(iph + 1);
475 	}
476 	c_th->th_flags |= (th->th_flags & TH_PUSH);
477 	c->th_last = th;
478 	++st->n_merges;
479 
480 	/* Pass packet up now if another segment could overflow the IP
481 	 * length.
482 	 */
483 	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
484 		sfxge_lro_deliver(st, c);
485 }
486 
487 static void
488 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
489 		struct mbuf *mbuf, void *nh, struct tcphdr *th)
490 {
491 	/* Start the chain */
492 	c->mbuf = mbuf;
493 	c->mbuf_tail = c->mbuf;
494 	c->nh = nh;
495 	c->th_last = th;
496 
497 	mbuf->m_pkthdr.len = mbuf->m_len;
498 
499 	/* Mangle header fields for later processing */
500 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
501 		struct ip *iph = nh;
502 		iph->ip_len = ntohs(iph->ip_len);
503 	} else {
504 		struct ip6_hdr *iph = nh;
505 		iph->ip6_plen = ntohs(iph->ip6_plen);
506 	}
507 }
508 
509 /* Try to merge or otherwise hold or deliver (as appropriate) the
510  * packet buffered for this connection (c->next_buf).  Return a flag
511  * indicating whether the connection is still active for LRO purposes.
 *
 * Returns 0 when the connection was dropped with sfxge_lro_drop() (idle
 * for too long, or FIN/RST seen once merging is enabled); in that case
 * the buffered packet has been delivered by sfxge_lro_drop().  Returns 1
 * otherwise, after the buffered packet was either delivered on its own or
 * taken into the connection's mbuf chain.
512  */
513 static int
514 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
515 {
516 	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
517 	char *eh = c->next_eh;
518 	int data_length, hdr_length, dont_merge;
519 	unsigned th_seq, pkt_length;
520 	struct tcphdr *th;
521 	unsigned now;
522 
	/* Locate the TCP header and compute the frame length, measured from
	 * the Ethernet header, that the IP header claims.  For IPv6 the
	 * payload length field does not include the fixed header, hence the
	 * offset is taken from th rather than from iph.
	 */
523 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
524 		struct ip *iph = c->next_nh;
525 		th = (struct tcphdr *)(iph + 1);
526 		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
527 	} else {
528 		struct ip6_hdr *iph = c->next_nh;
529 		th = (struct tcphdr *)(iph + 1);
530 		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
531 	}
532 
	/* hdr_length: bytes from the Ethernet header to the end of the TCP
	 * header (options included).  data_length: TCP payload bytes, bounded
	 * by what was actually received after the packet prefix.
	 */
533 	hdr_length = (char *) th + th->th_off * 4 - eh;
534 	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
535 		       hdr_length);
536 	th_seq = ntohl(th->th_seq);
	/* Non-zero when there is no payload or any of URG/SYN/RST/FIN is set
	 * (bitwise OR, so no short-circuit branch).
	 */
537 	dont_merge = ((data_length <= 0)
538 		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
539 
540 	/* Check for options other than aligned timestamp. */
541 	if (th->th_off != 5) {
542 		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		/* Only a header of 8 words whose first option word is
		 * NOP, NOP, TIMESTAMP, TCPOLEN_TIMESTAMP is acceptable.
		 */
543 		if (th->th_off == 8 &&
544 		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
545 					(TCPOPT_NOP << 16) |
546 					(TCPOPT_TIMESTAMP << 8) |
547 					TCPOLEN_TIMESTAMP)) {
548 			/* timestamp option -- okay */
549 		} else {
550 			dont_merge = 1;
551 		}
552 	}
553 
554 	if (__predict_false(th_seq != c->next_seq)) {
555 		/* Out-of-order, so start counting again. */
		/* Flush any held chain, penalise the in-order counter by
		 * lro_loss_packets, resynchronise the expected sequence
		 * number and deliver this packet unmerged.
		 */
556 		if (c->mbuf != NULL)
557 			sfxge_lro_deliver(&rxq->lro, c);
558 		c->n_in_order_pkts -= lro_loss_packets;
559 		c->next_seq = th_seq + data_length;
560 		++rxq->lro.n_misorder;
561 		goto deliver_buf_out;
562 	}
563 	c->next_seq = th_seq + data_length;
564 
	/* Connection idle for more than lro_idle_ticks: flush the chain and
	 * stop tracking it.  sfxge_lro_drop() delivers c->next_buf.
	 */
565 	now = ticks;
566 	if (now - c->last_pkt_ticks > lro_idle_ticks) {
567 		++rxq->lro.n_drop_idle;
568 		if (c->mbuf != NULL)
569 			sfxge_lro_deliver(&rxq->lro, c);
570 		sfxge_lro_drop(rxq, c);
571 		return (0);
572 	}
573 	c->last_pkt_ticks = ticks;
574 
575 	if (c->n_in_order_pkts < lro_slow_start_packets) {
576 		/* May be in slow-start, so don't merge. */
577 		++rxq->lro.n_slow_start;
578 		++c->n_in_order_pkts;
579 		goto deliver_buf_out;
580 	}
581 
	/* Unmergeable segment: flush the chain first so ordering is kept,
	 * then either drop the connection (FIN/RST) or deliver the packet
	 * on its own.
	 */
582 	if (__predict_false(dont_merge)) {
583 		if (c->mbuf != NULL)
584 			sfxge_lro_deliver(&rxq->lro, c);
585 		if (th->th_flags & (TH_FIN | TH_RST)) {
586 			++rxq->lro.n_drop_closed;
587 			sfxge_lro_drop(rxq, c);
588 			return (0);
589 		}
590 		goto deliver_buf_out;
591 	}
592 
	/* From here on the packet joins the chain: skip the packet prefix. */
593 	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
594 
595 	if (__predict_true(c->mbuf != NULL)) {
596 		/* Remove headers and any padding */
597 		rx_buf->mbuf->m_data += hdr_length;
598 		rx_buf->mbuf->m_len = data_length;
599 
600 		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
601 	} else {
602 		/* Remove any padding */
603 		rx_buf->mbuf->m_len = pkt_length;
604 
605 		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
606 	}
607 
	/* The mbuf now belongs to the connection's chain (or, if
	 * sfxge_lro_merge() delivered it, to the stack).
	 */
608 	rx_buf->mbuf = NULL;
609 	return (1);
610 
611  deliver_buf_out:
	/* Pass the buffered packet up the stack as an ordinary packet. */
612 	sfxge_rx_deliver(rxq->sc, rx_buf);
613 	return (1);
614 }
615 
616 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
617 			       uint16_t l2_id, void *nh, struct tcphdr *th)
618 {
619 	unsigned bucket = conn_hash & st->conns_mask;
620 	struct sfxge_lro_conn *c;
621 
622 	if (st->conns_n[bucket] >= lro_chain_max) {
623 		++st->n_too_many;
624 		return;
625 	}
626 
627 	if (!TAILQ_EMPTY(&st->free_conns)) {
628 		c = TAILQ_FIRST(&st->free_conns);
629 		TAILQ_REMOVE(&st->free_conns, c, link);
630 	} else {
631 		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
632 		if (c == NULL)
633 			return;
634 		c->mbuf = NULL;
635 		c->next_buf.mbuf = NULL;
636 	}
637 
638 	/* Create the connection tracking data */
639 	++st->conns_n[bucket];
640 	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
641 	c->l2_id = l2_id;
642 	c->conn_hash = conn_hash;
643 	c->source = th->th_sport;
644 	c->dest = th->th_dport;
645 	c->n_in_order_pkts = 0;
646 	c->last_pkt_ticks = *(volatile int *)&ticks;
647 	c->delivered = 0;
648 	++st->n_new_stream;
649 	/* NB. We don't initialise c->next_seq, and it doesn't matter what
650 	 * value it has.  Most likely the next packet received for this
651 	 * connection will not match -- no harm done.
652 	 */
653 }
654 
655 /* Process mbuf and decide whether to dispatch it to the stack now or
656  * later.
 *
 * rx_buf describes a received packet whose mbuf still starts with the
 * packet prefix.  If the packet belongs to a tracked connection it is
 * parked in that connection's next_buf (after any previously parked
 * packet has been handled by sfxge_lro_try_merge()) and rx_buf gives up
 * its mbuf.  Otherwise the packet is delivered immediately with
 * sfxge_rx_deliver(); for an unknown TCP flow a tracking entry is
 * requested first so that later packets can be coalesced.
657  */
658 static void
659 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
660 {
661 	struct sfxge_softc *sc = rxq->sc;
662 	struct mbuf *m = rx_buf->mbuf;
663 	struct ether_header *eh;
664 	struct sfxge_lro_conn *c;
665 	uint16_t l2_id;
666 	uint16_t l3_proto;
667 	void *nh;
668 	struct tcphdr *th;
669 	uint32_t conn_hash;
670 	unsigned bucket;
671 
672 	/* Get the hardware hash */
673 	conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
674 					    EFX_RX_HASHALG_TOEPLITZ,
675 					    mtod(m, uint8_t *));
676 
	/* The Ethernet header follows the packet prefix.  A VLAN tag, if
	 * present, contributes its VLAN id to l2_id and shifts the network
	 * header.  l3_proto is kept in network byte order.
	 */
677 	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
678 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
679 		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
680 		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
681 			SFXGE_LRO_L2_ID_VLAN;
682 		l3_proto = veh->evl_proto;
683 		nh = veh + 1;
684 	} else {
685 		l2_id = 0;
686 		l3_proto = eh->ether_type;
687 		nh = eh + 1;
688 	}
689 
690 	/* Check whether this is a suitable packet (unfragmented
691 	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
692 	 * length, and compute a hash if necessary.  If not, return.
693 	 */
694 	if (l3_proto == htons(ETHERTYPE_IP)) {
695 		struct ip *iph = nh;
696 
697 		KASSERT(iph->ip_p == IPPROTO_TCP,
698 		    ("IPv4 protocol is not TCP, but packet marker is set"));
		/* Non-zero if the header carries IP options (ip_hl differs
		 * from sizeof(struct ip) / 4) or the packet is a fragment.
		 */
699 		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
700 		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
701 			goto deliver_now;
702 		th = (struct tcphdr *)(iph + 1);
703 	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
704 		struct ip6_hdr *iph = nh;
705 
706 		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
707 		    ("IPv6 next header is not TCP, but packet marker is set"));
708 		l2_id |= SFXGE_LRO_L2_ID_IPV6;
709 		th = (struct tcphdr *)(iph + 1);
710 	} else {
711 		goto deliver_now;
712 	}
713 
714 	bucket = conn_hash & rxq->lro.conns_mask;
715 
	/* Search the hash bucket.  Each test is a subtraction OR-ed
	 * together, which is non-zero if any compared field differs.
	 */
716 	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
717 		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
718 			continue;
719 		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
720 			continue;
		/* IP addresses are only compared while a chain is held,
		 * against the header recorded in c->nh by sfxge_lro_start().
		 */
721 		if (c->mbuf != NULL) {
722 			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
723 				struct ip *c_iph, *iph = nh;
724 				c_iph = c->nh;
725 				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
726 				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
727 					continue;
728 			} else {
729 				struct ip6_hdr *c_iph, *iph = nh;
730 				c_iph = c->nh;
731 				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
732 				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
733 					continue;
734 			}
735 		}
736 
737 		/* Re-insert at head of list to reduce lookup time. */
738 		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
739 		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
740 
		/* A packet is already parked for this connection: process it
		 * before parking the new one.  A zero return means the
		 * connection was dropped, so this packet is delivered
		 * directly.  With nothing parked, the connection becomes
		 * active for this burst.
		 */
741 		if (c->next_buf.mbuf != NULL) {
742 			if (!sfxge_lro_try_merge(rxq, c))
743 				goto deliver_now;
744 		} else {
745 			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
746 			    active_link);
747 		}
		/* Park this packet; the connection now owns the mbuf. */
748 		c->next_buf = *rx_buf;
749 		c->next_eh = eh;
750 		c->next_nh = nh;
751 
752 		rx_buf->mbuf = NULL;
753 		rx_buf->flags = EFX_DISCARD;
754 		return;
755 	}
756 
	/* Unknown flow: try to start tracking it, then fall through and
	 * deliver this packet without coalescing.
	 */
757 	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
758  deliver_now:
759 	sfxge_rx_deliver(sc, rx_buf);
760 }
761 
762 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
763 {
764 	struct sfxge_lro_state *st = &rxq->lro;
765 	struct sfxge_lro_conn *c;
766 	unsigned t;
767 
768 	while (!LIST_EMPTY(&st->active_conns)) {
769 		c = LIST_FIRST(&st->active_conns);
770 		if (!c->delivered && c->mbuf != NULL)
771 			sfxge_lro_deliver(st, c);
772 		if (sfxge_lro_try_merge(rxq, c)) {
773 			if (c->mbuf != NULL)
774 				sfxge_lro_deliver(st, c);
775 			LIST_REMOVE(c, active_link);
776 		}
777 		c->delivered = 0;
778 	}
779 
780 	t = *(volatile int *)&ticks;
781 	if (__predict_false(t != st->last_purge_ticks))
782 		sfxge_lro_purge_idle(rxq, t);
783 }
784 
785 #else	/* !SFXGE_LRO */
786 
/*
 * Stub used when the driver is built without SFXGE_LRO: does nothing.
 * NOTE(review): rx_buf is neither delivered nor freed here, so this relies
 * on the caller never selecting the LRO path in such a build -- confirm.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	/* Intentionally empty. */
}
791 
/*
 * Stub used when the driver is built without SFXGE_LRO: there is no
 * connection state to flush, so this does nothing.
 */
static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	/* Intentionally empty. */
}
796 
797 #endif	/* SFXGE_LRO */
798 
/*
 * Process the receive descriptors of rxq between rxq->completed and
 * rxq->pending.
 *
 * Each packet is either discarded (queue not started, descriptor flagged
 * EFX_ADDR_MISMATCH or EFX_DISCARD, loopback frame, or inconsistent
 * protocol flags) or handed to LRO / the network stack.  Delivery runs one
 * packet behind the loop (see prev), so the data of the current packet is
 * prefetched while the previous one is being delivered.
 *
 * eop - when true, pending LRO flows are completed before returning.
 *
 * The lock of the event queue with the same index must be held (asserted
 * below).  The queue is topped up when its fill level has fallen below
 * rxq->refill_threshold.
 */
799 void
800 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
801 {
802 	struct sfxge_softc *sc = rxq->sc;
803 	int if_capenable = sc->ifnet->if_capenable;
804 	int lro_enabled = if_capenable & IFCAP_LRO;
805 	unsigned int index;
806 	struct sfxge_evq *evq;
807 	unsigned int completed;
808 	unsigned int level;
809 	struct mbuf *m;
	/* Descriptor accepted in the previous iteration, not yet delivered. */
810 	struct sfxge_rx_sw_desc *prev = NULL;
811 
812 	index = rxq->index;
813 	evq = sc->evq[index];
814 
815 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
816 
817 	completed = rxq->completed;
818 	while (completed != rxq->pending) {
819 		unsigned int id;
820 		struct sfxge_rx_sw_desc *rx_desc;
821 
		/* Ring index of the next completed descriptor. */
822 		id = completed++ & rxq->ptr_mask;
823 		rx_desc = &rxq->queue[id];
824 		m = rx_desc->mbuf;
825 
		/* While the queue is not started (e.g. during
		 * sfxge_rx_qstop()) every buffer is simply freed.
		 */
826 		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
827 			goto discard;
828 
829 		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
830 			goto discard;
831 
832 		/* Read the length from the pseudo header if required */
833 		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
834 			uint16_t tmp_size;
835 			int rc;
836 			rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
837 							   mtod(m, uint8_t *),
838 							   &tmp_size);
839 			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			/* rx_desc->size includes the packet prefix. */
840 			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
841 		}
842 
843 		prefetch_read_many(mtod(m, caddr_t));
844 
		/* Hide hardware checksum results the interface has not been
		 * asked to use, and filter loopback frames among non-IP
		 * packets.
		 */
845 		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
846 		case EFX_PKT_IPV4:
847 			if (~if_capenable & IFCAP_RXCSUM)
848 				rx_desc->flags &=
849 				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
850 			break;
851 		case EFX_PKT_IPV6:
852 			if (~if_capenable & IFCAP_RXCSUM_IPV6)
853 				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
854 			break;
855 		case 0:
856 			/* Check for loopback packets */
			/* NOTE(review): the Ethernet header is read at the
			 * start of the mbuf data without skipping
			 * sc->rx_prefix_size, unlike sfxge_lro() -- confirm
			 * this is intended.
			 */
857 			{
858 				struct ether_header *etherhp;
859 
860 				/*LINTED*/
861 				etherhp = mtod(m, struct ether_header *);
862 
863 				if (etherhp->ether_type ==
864 				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
865 					EFSYS_PROBE(loopback);
866 
867 					rxq->loopback++;
868 					goto discard;
869 				}
870 			}
871 			break;
872 		default:
873 			KASSERT(B_FALSE,
874 			    ("Rx descriptor with both IPv4 and IPv6 flags"));
875 			goto discard;
876 		}
877 
878 		/* Pass packet up the stack or into LRO (pipelined) */
		/* LRO is used only for TCP packets whose TCP/UDP checksum
		 * flag is still set.
		 */
879 		if (prev != NULL) {
880 			if (lro_enabled &&
881 			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
882 			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
883 				sfxge_lro(rxq, prev);
884 			else
885 				sfxge_rx_deliver(sc, prev);
886 		}
887 		prev = rx_desc;
888 		continue;
889 
890 discard:
891 		/* Return the packet to the pool */
892 		m_free(m);
893 		rx_desc->mbuf = NULL;
894 	}
895 	rxq->completed = completed;
896 
	/* Number of descriptors still outstanding on the queue. */
897 	level = rxq->added - rxq->completed;
898 
899 	/* Pass last packet up the stack or into LRO */
900 	if (prev != NULL) {
901 		if (lro_enabled &&
902 		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
903 		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
904 			sfxge_lro(rxq, prev);
905 		else
906 			sfxge_rx_deliver(sc, prev);
907 	}
908 
909 	/*
910 	 * If there are any pending flows and this is the end of the
911 	 * poll then they must be completed.
912 	 */
913 	if (eop)
914 		sfxge_lro_end_of_burst(rxq);
915 
916 	/* Top up the queue if necessary */
917 	if (level < rxq->refill_threshold)
918 		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
919 }
920 
/*
 * Stop receive queue number index of adapter sc.
 *
 * The queue is moved back to SFXGE_RXQ_INITIALIZED, its refill callout is
 * stopped and a hardware flush is requested (up to three attempts).  All
 * buffers still posted are then reclaimed through sfxge_rx_qcomplete(),
 * the ring counters are reset, and the common-code queue and its buffer
 * table entries are released.
 *
 * The adapter lock must be held.  The lock of the matching event queue is
 * taken here and dropped while waiting for the flush, whose outcome is
 * read from rxq->flush_state (set by sfxge_rx_qflush_done() /
 * sfxge_rx_qflush_failed()).
 */
921 static void
922 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
923 {
924 	struct sfxge_rxq *rxq;
925 	struct sfxge_evq *evq;
926 	unsigned int count;
927 	unsigned int retry = 3;
928 
929 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
930 
931 	rxq = sc->rxq[index];
932 	evq = sc->evq[index];
933 
934 	SFXGE_EVQ_LOCK(evq);
935 
936 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
937 	    ("rxq not started"));
938 
	/* From now on sfxge_rx_qfill() and sfxge_rx_qrefill() do nothing
	 * and sfxge_rx_qcomplete() discards every buffer.
	 */
939 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
940 
941 	callout_stop(&rxq->refill_callout);
942 
943 	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
944 		rxq->flush_state = SFXGE_FLUSH_PENDING;
945 
		/* The event queue lock is not held while the flush is
		 * requested and polled.
		 */
946 		SFXGE_EVQ_UNLOCK(evq);
947 
948 		/* Flush the receive queue */
949 		if (efx_rx_qflush(rxq->common) != 0) {
			/* Re-take the lock so both exits from the loop hold it. */
950 			SFXGE_EVQ_LOCK(evq);
951 			rxq->flush_state = SFXGE_FLUSH_FAILED;
952 			break;
953 		}
954 
		/* Poll for up to 20 x 100 ms for the state to change. */
955 		count = 0;
956 		do {
957 			/* Spin for 100 ms */
958 			DELAY(100000);
959 
960 			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
961 				break;
962 
963 		} while (++count < 20);
964 
965 		SFXGE_EVQ_LOCK(evq);
966 
967 		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
968 			/* Flush timeout - neither done nor failed */
969 			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
970 			    device_get_nameunit(sc->dev), index);
			/* Treat the timeout as done so the loop terminates. */
971 			rxq->flush_state = SFXGE_FLUSH_DONE;
972 		}
973 		retry--;
974 	}
	/* A flush that still counts as failed is logged and then treated
	 * as done so that teardown can continue.
	 */
975 	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
976 		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
977 		    device_get_nameunit(sc->dev), index);
978 		rxq->flush_state = SFXGE_FLUSH_DONE;
979 	}
980 
	/* Mark every posted buffer as pending so that the completion path,
	 * seeing a queue that is no longer started, frees them all.
	 */
981 	rxq->pending = rxq->added;
982 	sfxge_rx_qcomplete(rxq, B_TRUE);
983 
984 	KASSERT(rxq->completed == rxq->pending,
985 	    ("rxq->completed != rxq->pending"));
986 
	/* Reset the ring bookkeeping for a later restart. */
987 	rxq->added = 0;
988 	rxq->pushed = 0;
989 	rxq->pending = 0;
990 	rxq->completed = 0;
991 	rxq->loopback = 0;
992 
993 	/* Destroy the common code receive queue. */
994 	efx_rx_qdestroy(rxq->common);
995 
996 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
997 	    EFX_RXQ_NBUFS(sc->rxq_entries));
998 
999 	SFXGE_EVQ_UNLOCK(evq);
1000 }
1001 
1002 static int
1003 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1004 {
1005 	struct sfxge_rxq *rxq;
1006 	efsys_mem_t *esmp;
1007 	struct sfxge_evq *evq;
1008 	int rc;
1009 
1010 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1011 
1012 	rxq = sc->rxq[index];
1013 	esmp = &rxq->mem;
1014 	evq = sc->evq[index];
1015 
1016 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1017 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1018 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1019 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1020 
1021 	/* Program the buffer table. */
1022 	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1023 	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1024 		return (rc);
1025 
1026 	/* Create the common code receive queue. */
1027 	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1028 	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1029 	    &rxq->common)) != 0)
1030 		goto fail;
1031 
1032 	SFXGE_EVQ_LOCK(evq);
1033 
1034 	/* Enable the receive queue. */
1035 	efx_rx_qenable(rxq->common);
1036 
1037 	rxq->init_state = SFXGE_RXQ_STARTED;
1038 	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1039 
1040 	/* Try to fill the queue from the pool. */
1041 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1042 
1043 	SFXGE_EVQ_UNLOCK(evq);
1044 
1045 	return (0);
1046 
1047 fail:
1048 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1049 	    EFX_RXQ_NBUFS(sc->rxq_entries));
1050 	return (rc);
1051 }
1052 
1053 void
1054 sfxge_rx_stop(struct sfxge_softc *sc)
1055 {
1056 	int index;
1057 
1058 	efx_mac_filter_default_rxq_clear(sc->enp);
1059 
1060 	/* Stop the receive queue(s) */
1061 	index = sc->rxq_count;
1062 	while (--index >= 0)
1063 		sfxge_rx_qstop(sc, index);
1064 
1065 	sc->rx_prefix_size = 0;
1066 	sc->rx_buffer_size = 0;
1067 
1068 	efx_rx_fini(sc->enp);
1069 }
1070 
1071 int
1072 sfxge_rx_start(struct sfxge_softc *sc)
1073 {
1074 	struct sfxge_intr *intr;
1075 	const efx_nic_cfg_t *encp;
1076 	size_t hdrlen, align, reserved;
1077 	int index;
1078 	int rc;
1079 
1080 	intr = &sc->intr;
1081 
1082 	/* Initialize the common code receive module. */
1083 	if ((rc = efx_rx_init(sc->enp)) != 0)
1084 		return (rc);
1085 
1086 	encp = efx_nic_cfg_get(sc->enp);
1087 	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1088 
1089 	/* Calculate the receive packet buffer size. */
1090 	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1091 
1092 	/* Ensure IP headers are 32bit aligned */
1093 	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1094 	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1095 
1096 	sc->rx_buffer_size += sc->rx_buffer_align;
1097 
1098 	/* Align end of packet buffer for RX DMA end padding */
1099 	align = MAX(1, encp->enc_rx_buf_align_end);
1100 	EFSYS_ASSERT(ISP2(align));
1101 	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1102 
1103 	/*
1104 	 * Standard mbuf zones only guarantee pointer-size alignment;
1105 	 * we need extra space to align to the cache line
1106 	 */
1107 	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1108 
1109 	/* Select zone for packet buffers */
1110 	if (reserved <= MCLBYTES)
1111 		sc->rx_cluster_size = MCLBYTES;
1112 	else if (reserved <= MJUMPAGESIZE)
1113 		sc->rx_cluster_size = MJUMPAGESIZE;
1114 	else if (reserved <= MJUM9BYTES)
1115 		sc->rx_cluster_size = MJUM9BYTES;
1116 	else
1117 		sc->rx_cluster_size = MJUM16BYTES;
1118 
1119 	/*
1120 	 * Set up the scale table.  Enable all hash types and hash insertion.
1121 	 */
1122 	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1123 		sc->rx_indir_table[index] = index % sc->rxq_count;
1124 	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1125 				       SFXGE_RX_SCALE_MAX)) != 0)
1126 		goto fail;
1127 	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1128 	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1129 	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1130 
1131 	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1132 				       sizeof(toep_key))) != 0)
1133 		goto fail;
1134 
1135 	/* Start the receive queue(s). */
1136 	for (index = 0; index < sc->rxq_count; index++) {
1137 		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1138 			goto fail2;
1139 	}
1140 
1141 	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1142 					    sc->intr.n_alloc > 1);
1143 	if (rc != 0)
1144 		goto fail3;
1145 
1146 	return (0);
1147 
1148 fail3:
1149 fail2:
1150 	while (--index >= 0)
1151 		sfxge_rx_qstop(sc, index);
1152 
1153 fail:
1154 	efx_rx_fini(sc->enp);
1155 
1156 	return (rc);
1157 }
1158 
1159 #ifdef SFXGE_LRO
1160 
1161 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1162 {
1163 	struct sfxge_lro_state *st = &rxq->lro;
1164 	unsigned i;
1165 
1166 	st->conns_mask = lro_table_size - 1;
1167 	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1168 		("lro_table_size must be a power of 2"));
1169 	st->sc = rxq->sc;
1170 	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1171 			   M_SFXGE, M_WAITOK);
1172 	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1173 			     M_SFXGE, M_WAITOK);
1174 	for (i = 0; i <= st->conns_mask; ++i) {
1175 		TAILQ_INIT(&st->conns[i]);
1176 		st->conns_n[i] = 0;
1177 	}
1178 	LIST_INIT(&st->active_conns);
1179 	TAILQ_INIT(&st->free_conns);
1180 }
1181 
1182 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1183 {
1184 	struct sfxge_lro_state *st = &rxq->lro;
1185 	struct sfxge_lro_conn *c;
1186 	unsigned i;
1187 
1188 	/* Return cleanly if sfxge_lro_init() has not been called. */
1189 	if (st->conns == NULL)
1190 		return;
1191 
1192 	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1193 
1194 	for (i = 0; i <= st->conns_mask; ++i) {
1195 		while (!TAILQ_EMPTY(&st->conns[i])) {
1196 			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1197 			sfxge_lro_drop(rxq, c);
1198 		}
1199 	}
1200 
1201 	while (!TAILQ_EMPTY(&st->free_conns)) {
1202 		c = TAILQ_FIRST(&st->free_conns);
1203 		TAILQ_REMOVE(&st->free_conns, c, link);
1204 		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1205 		free(c, M_SFXGE);
1206 	}
1207 
1208 	free(st->conns_n, M_SFXGE);
1209 	free(st->conns, M_SFXGE);
1210 	st->conns = NULL;
1211 }
1212 
1213 #else
1214 
/* Built without SFXGE_LRO: there is no LRO state to set up. */
static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
	(void)rxq;
}
1219 
/* Built without SFXGE_LRO: there is no LRO state to tear down. */
static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	(void)rxq;
}
1224 
1225 #endif	/* SFXGE_LRO */
1226 
1227 static void
1228 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1229 {
1230 	struct sfxge_rxq *rxq;
1231 
1232 	rxq = sc->rxq[index];
1233 
1234 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1235 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1236 
1237 	/* Free the context array and the flow table. */
1238 	free(rxq->queue, M_SFXGE);
1239 	sfxge_lro_fini(rxq);
1240 
1241 	/* Release DMA memory. */
1242 	sfxge_dma_free(&rxq->mem);
1243 
1244 	sc->rxq[index] = NULL;
1245 
1246 	free(rxq, M_SFXGE);
1247 }
1248 
1249 static int
1250 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1251 {
1252 	struct sfxge_rxq *rxq;
1253 	struct sfxge_evq *evq;
1254 	efsys_mem_t *esmp;
1255 	int rc;
1256 
1257 	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1258 
1259 	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1260 	rxq->sc = sc;
1261 	rxq->index = index;
1262 	rxq->entries = sc->rxq_entries;
1263 	rxq->ptr_mask = rxq->entries - 1;
1264 	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1265 
1266 	sc->rxq[index] = rxq;
1267 	esmp = &rxq->mem;
1268 
1269 	evq = sc->evq[index];
1270 
1271 	/* Allocate and zero DMA space. */
1272 	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1273 		return (rc);
1274 
1275 	/* Allocate buffer table entries. */
1276 	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1277 				 &rxq->buf_base_id);
1278 
1279 	/* Allocate the context array and the flow table. */
1280 	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1281 	    M_SFXGE, M_WAITOK | M_ZERO);
1282 	sfxge_lro_init(rxq);
1283 
1284 	callout_init(&rxq->refill_callout, 1);
1285 
1286 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1287 
1288 	return (0);
1289 }
1290 
/*
 * Table of per-queue receive statistics exported via sysctl.  Each entry
 * pairs the sysctl leaf name with the byte offset of the corresponding
 * counter inside struct sfxge_rxq.  Only LRO counters are listed, so the
 * table is empty unless SFXGE_LRO is defined.
 */
static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(stat, member) \
	{ .name = #stat, .offset = offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};
1308 
1309 static int
1310 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1311 {
1312 	struct sfxge_softc *sc = arg1;
1313 	unsigned int id = arg2;
1314 	unsigned int sum, index;
1315 
1316 	/* Sum across all RX queues */
1317 	sum = 0;
1318 	for (index = 0; index < sc->rxq_count; index++)
1319 		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1320 					 sfxge_rx_stats[id].offset);
1321 
1322 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1323 }
1324 
1325 static void
1326 sfxge_rx_stat_init(struct sfxge_softc *sc)
1327 {
1328 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1329 	struct sysctl_oid_list *stat_list;
1330 	unsigned int id;
1331 
1332 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1333 
1334 	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1335 		SYSCTL_ADD_PROC(
1336 			ctx, stat_list,
1337 			OID_AUTO, sfxge_rx_stats[id].name,
1338 			CTLTYPE_UINT|CTLFLAG_RD,
1339 			sc, id, sfxge_rx_stat_handler, "IU",
1340 			"");
1341 	}
1342 }
1343 
1344 void
1345 sfxge_rx_fini(struct sfxge_softc *sc)
1346 {
1347 	int index;
1348 
1349 	index = sc->rxq_count;
1350 	while (--index >= 0)
1351 		sfxge_rx_qfini(sc, index);
1352 
1353 	sc->rxq_count = 0;
1354 }
1355 
1356 int
1357 sfxge_rx_init(struct sfxge_softc *sc)
1358 {
1359 	struct sfxge_intr *intr;
1360 	int index;
1361 	int rc;
1362 
1363 #ifdef SFXGE_LRO
1364 	if (!ISP2(lro_table_size)) {
1365 		log(LOG_ERR, "%s=%u must be power of 2",
1366 		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1367 		rc = EINVAL;
1368 		goto fail_lro_table_size;
1369 	}
1370 
1371 	if (lro_idle_ticks == 0)
1372 		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1373 #endif
1374 
1375 	intr = &sc->intr;
1376 
1377 	sc->rxq_count = intr->n_alloc;
1378 
1379 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1380 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1381 
1382 	/* Initialize the receive queue(s) - one per interrupt. */
1383 	for (index = 0; index < sc->rxq_count; index++) {
1384 		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1385 			goto fail;
1386 	}
1387 
1388 	sfxge_rx_stat_init(sc);
1389 
1390 	return (0);
1391 
1392 fail:
1393 	/* Tear down the receive queue(s). */
1394 	while (--index >= 0)
1395 		sfxge_rx_qfini(sc, index);
1396 
1397 	sc->rxq_count = 0;
1398 
1399 #ifdef SFXGE_LRO
1400 fail_lro_table_size:
1401 #endif
1402 	return (rc);
1403 }
1404