xref: /freebsd/sys/dev/sfxge/sfxge_rx.c (revision d13def78ccef6dbc25c2e197089ee5fc4d7b82c3)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2010-2016 Solarflare Communications Inc.
5  * All rights reserved.
6  *
7  * This software was developed in part by Philip Paeps under contract for
8  * Solarflare Communications, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright notice,
14  *    this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright notice,
16  *    this list of conditions and the following disclaimer in the documentation
17  *    and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
23  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * The views and conclusions contained in the software and documentation are
32  * those of the authors and should not be interpreted as representing official
33  * policies, either expressed or implied, of the FreeBSD Project.
34  */
35 
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38 
39 #include "opt_rss.h"
40 
41 #include <sys/param.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/smp.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/limits.h>
49 #include <sys/syslog.h>
50 
51 #include <net/ethernet.h>
52 #include <net/if.h>
53 #include <net/if_vlan_var.h>
54 
55 #include <netinet/in.h>
56 #include <netinet/ip.h>
57 #include <netinet/ip6.h>
58 #include <netinet/tcp.h>
59 
60 #include <machine/in_cksum.h>
61 
62 #ifdef RSS
63 #include <net/rss_config.h>
64 #endif
65 
66 #include "common/efx.h"
67 
68 
69 #include "sfxge.h"
70 #include "sfxge_rx.h"
71 
72 #define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
73 
74 #ifdef SFXGE_LRO
75 
/*
 * Sysctl node "lro" under _hw_sfxge.  The LRO parameters declared below
 * are attached to it (they name _hw_sfxge_lro as their parent).
 */
SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Large receive offload (LRO) parameters");

/* Build the name of an LRO parameter by prefixing it with "lro." and
 * handing it to SFXGE_PARAM().
 */
#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/*
 * Each parameter below is registered twice: once with TUNABLE_INT() and
 * once as a sysctl flagged CTLFLAG_RDTUN.
 */

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before it's LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 *
 * NOTE(review): declared as a signed int but exported with SYSCTL_UINT;
 * it is compared against the per-connection in-order packet count, which
 * can be driven negative by lro_loss_packets.  Confirm the signedness is
 * intentional before changing either side.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 *
 * This value is subtracted from a connection's in-order packet count when
 * an out-of-order segment is seen.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
/* Non-zero when the connection was learned from a VLAN-tagged frame. */
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
/* True when the IPv6 flag is clear, i.e. the connection is TCP over IPv4. */
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
138 
139 /* Compare IPv6 addresses, avoiding conditional branches */
140 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
141 				   const struct in6_addr *right)
142 {
143 #if LONG_BIT == 64
144 	const uint64_t *left64 = (const uint64_t *)left;
145 	const uint64_t *right64 = (const uint64_t *)right;
146 	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
147 #else
148 	return (left->s6_addr32[0] - right->s6_addr32[0]) |
149 	       (left->s6_addr32[1] - right->s6_addr32[1]) |
150 	       (left->s6_addr32[2] - right->s6_addr32[2]) |
151 	       (left->s6_addr32[3] - right->s6_addr32[3]);
152 #endif
153 }
154 
155 #endif	/* SFXGE_LRO */
156 
157 void
158 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
159 {
160 
161 	rxq->flush_state = SFXGE_FLUSH_DONE;
162 }
163 
164 void
165 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
166 {
167 
168 	rxq->flush_state = SFXGE_FLUSH_FAILED;
169 }
170 
/*
 * Toeplitz hash key storage.
 *
 * With the kernel RSS option the array is only sized here (RSS_KEYSIZE
 * bytes); presumably it is filled in from the kernel RSS configuration
 * elsewhere in this file -- not visible in this section, confirm.
 * Without RSS a fixed 40-byte key is compiled in.
 */
#ifdef RSS
static uint8_t toep_key[RSS_KEYSIZE];
#else
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif
182 
183 static void
184 sfxge_rx_post_refill(void *arg)
185 {
186 	struct sfxge_rxq *rxq = arg;
187 	struct sfxge_softc *sc;
188 	unsigned int index;
189 	struct sfxge_evq *evq;
190 	uint16_t magic;
191 
192 	sc = rxq->sc;
193 	index = rxq->index;
194 	evq = sc->evq[index];
195 	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
196 
197 	/* This is guaranteed due to the start/stop order of rx and ev */
198 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
199 	    ("evq not started"));
200 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
201 	    ("rxq not started"));
202 	efx_ev_qpost(evq->common, magic);
203 }
204 
205 static void
206 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
207 {
208 	/* Initially retry after 100 ms, but back off in case of
209 	 * repeated failures as we probably have to wait for the
210 	 * administrator to raise the pool limit. */
211 	if (retrying)
212 		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
213 	else
214 		rxq->refill_delay = hz / 10;
215 
216 	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
217 			     sfxge_rx_post_refill, rxq);
218 }
219 
220 #define	SFXGE_REFILL_BATCH  64
221 
222 static void
223 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
224 {
225 	struct sfxge_softc *sc;
226 	unsigned int index;
227 	struct sfxge_evq *evq;
228 	unsigned int batch;
229 	unsigned int rxfill;
230 	unsigned int mblksize;
231 	int ntodo;
232 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
233 
234 	sc = rxq->sc;
235 	index = rxq->index;
236 	evq = sc->evq[index];
237 
238 	prefetch_read_many(sc->enp);
239 	prefetch_read_many(rxq->common);
240 
241 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
242 
243 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
244 		return;
245 
246 	rxfill = rxq->added - rxq->completed;
247 	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
248 	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
249 	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
250 	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
251 	    ("ntodo > EFX_RQX_LIMIT(rxq->entries)"));
252 
253 	if (ntodo == 0)
254 		return;
255 
256 	batch = 0;
257 	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
258 	while (ntodo-- > 0) {
259 		unsigned int id;
260 		struct sfxge_rx_sw_desc *rx_desc;
261 		bus_dma_segment_t seg;
262 		struct mbuf *m;
263 
264 		id = (rxq->added + batch) & rxq->ptr_mask;
265 		rx_desc = &rxq->queue[id];
266 		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
267 
268 		rx_desc->flags = EFX_DISCARD;
269 		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
270 		    sc->rx_cluster_size);
271 		if (m == NULL)
272 			break;
273 
274 		/* m_len specifies length of area to be mapped for DMA */
275 		m->m_len  = mblksize;
276 		m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
277 						   CACHE_LINE_SIZE);
278 		m->m_data += sc->rx_buffer_align;
279 
280 		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
281 		addr[batch++] = seg.ds_addr;
282 
283 		if (batch == SFXGE_REFILL_BATCH) {
284 			efx_rx_qpost(rxq->common, addr, mblksize, batch,
285 			    rxq->completed, rxq->added);
286 			rxq->added += batch;
287 			batch = 0;
288 		}
289 	}
290 
291 	if (ntodo != 0)
292 		sfxge_rx_schedule_refill(rxq, retrying);
293 
294 	if (batch != 0) {
295 		efx_rx_qpost(rxq->common, addr, mblksize, batch,
296 		    rxq->completed, rxq->added);
297 		rxq->added += batch;
298 	}
299 
300 	/* Make the descriptors visible to the hardware */
301 	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
302 			BUS_DMASYNC_PREWRITE);
303 
304 	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
305 
306 	/* The queue could still be empty if no descriptors were actually
307 	 * pushed, in which case there will be no event to cause the next
308 	 * refill, so we must schedule a refill ourselves.
309 	 */
310 	if(rxq->pushed == rxq->completed) {
311 		sfxge_rx_schedule_refill(rxq, retrying);
312 	}
313 }
314 
315 void
316 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
317 {
318 
319 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
320 		return;
321 
322 	/* Make sure the queue is full */
323 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
324 }
325 
326 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
327 {
328 	struct ifnet *ifp = sc->ifnet;
329 
330 	m->m_pkthdr.rcvif = ifp;
331 	m->m_pkthdr.csum_data = 0xffff;
332 	ifp->if_input(ifp, m);
333 }
334 
335 static void
336 sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
337 {
338 	struct sfxge_softc *sc = rxq->sc;
339 	struct mbuf *m = rx_desc->mbuf;
340 	int flags = rx_desc->flags;
341 	int csum_flags;
342 
343 	/* Convert checksum flags */
344 	csum_flags = (flags & EFX_CKSUM_IPV4) ?
345 		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
346 	if (flags & EFX_CKSUM_TCPUDP)
347 		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
348 
349 	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
350 		m->m_pkthdr.flowid =
351 			efx_pseudo_hdr_hash_get(rxq->common,
352 						EFX_RX_HASHALG_TOEPLITZ,
353 						mtod(m, uint8_t *));
354 		/* The hash covers a 4-tuple for TCP only */
355 		M_HASHTYPE_SET(m,
356 		    (flags & EFX_PKT_IPV4) ?
357 			((flags & EFX_PKT_TCP) ?
358 			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
359 			((flags & EFX_PKT_TCP) ?
360 			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
361 	}
362 	m->m_data += sc->rx_prefix_size;
363 	m->m_len = rx_desc->size - sc->rx_prefix_size;
364 	m->m_pkthdr.len = m->m_len;
365 	m->m_pkthdr.csum_flags = csum_flags;
366 	__sfxge_rx_deliver(sc, rx_desc->mbuf);
367 
368 	rx_desc->flags = EFX_DISCARD;
369 	rx_desc->mbuf = NULL;
370 }
371 
372 #ifdef SFXGE_LRO
373 
374 static void
375 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
376 {
377 	struct sfxge_softc *sc = st->sc;
378 	struct mbuf *m = c->mbuf;
379 	struct tcphdr *c_th;
380 	int csum_flags;
381 
382 	KASSERT(m, ("no mbuf to deliver"));
383 
384 	++st->n_bursts;
385 
386 	/* Finish off packet munging and recalculate IP header checksum. */
387 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
388 		struct ip *iph = c->nh;
389 		iph->ip_len = htons(iph->ip_len);
390 		iph->ip_sum = 0;
391 		iph->ip_sum = in_cksum_hdr(iph);
392 		c_th = (struct tcphdr *)(iph + 1);
393 		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
394 			      CSUM_IP_CHECKED | CSUM_IP_VALID);
395 	} else {
396 		struct ip6_hdr *iph = c->nh;
397 		iph->ip6_plen = htons(iph->ip6_plen);
398 		c_th = (struct tcphdr *)(iph + 1);
399 		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
400 	}
401 
402 	c_th->th_win = c->th_last->th_win;
403 	c_th->th_ack = c->th_last->th_ack;
404 	if (c_th->th_off == c->th_last->th_off) {
405 		/* Copy TCP options (take care to avoid going negative). */
406 		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
407 		memcpy(c_th + 1, c->th_last + 1, optlen);
408 	}
409 
410 	m->m_pkthdr.flowid = c->conn_hash;
411 	M_HASHTYPE_SET(m,
412 	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
413 		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
414 
415 	m->m_pkthdr.csum_flags = csum_flags;
416 	__sfxge_rx_deliver(sc, m);
417 
418 	c->mbuf = NULL;
419 	c->delivered = 1;
420 }
421 
422 /* Drop the given connection, and add it to the free list. */
423 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
424 {
425 	unsigned bucket;
426 
427 	KASSERT(!c->mbuf, ("found orphaned mbuf"));
428 
429 	if (c->next_buf.mbuf != NULL) {
430 		sfxge_rx_deliver(rxq, &c->next_buf);
431 		LIST_REMOVE(c, active_link);
432 	}
433 
434 	bucket = c->conn_hash & rxq->lro.conns_mask;
435 	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
436 	--rxq->lro.conns_n[bucket];
437 	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
438 	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
439 }
440 
441 /* Stop tracking connections that have gone idle in order to keep hash
442  * chains short.
443  */
444 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
445 {
446 	struct sfxge_lro_conn *c;
447 	unsigned i;
448 
449 	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
450 		("found active connections"));
451 
452 	rxq->lro.last_purge_ticks = now;
453 	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
454 		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
455 			continue;
456 
457 		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
458 		if (now - c->last_pkt_ticks > lro_idle_ticks) {
459 			++rxq->lro.n_drop_idle;
460 			sfxge_lro_drop(rxq, c);
461 		}
462 	}
463 }
464 
465 static void
466 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
467 		struct mbuf *mbuf, struct tcphdr *th)
468 {
469 	struct tcphdr *c_th;
470 
471 	/* Tack the new mbuf onto the chain. */
472 	KASSERT(!mbuf->m_next, ("mbuf already chained"));
473 	c->mbuf_tail->m_next = mbuf;
474 	c->mbuf_tail = mbuf;
475 
476 	/* Increase length appropriately */
477 	c->mbuf->m_pkthdr.len += mbuf->m_len;
478 
479 	/* Update the connection state flags */
480 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
481 		struct ip *iph = c->nh;
482 		iph->ip_len += mbuf->m_len;
483 		c_th = (struct tcphdr *)(iph + 1);
484 	} else {
485 		struct ip6_hdr *iph = c->nh;
486 		iph->ip6_plen += mbuf->m_len;
487 		c_th = (struct tcphdr *)(iph + 1);
488 	}
489 	c_th->th_flags |= (th->th_flags & TH_PUSH);
490 	c->th_last = th;
491 	++st->n_merges;
492 
493 	/* Pass packet up now if another segment could overflow the IP
494 	 * length.
495 	 */
496 	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
497 		sfxge_lro_deliver(st, c);
498 }
499 
500 static void
501 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
502 		struct mbuf *mbuf, void *nh, struct tcphdr *th)
503 {
504 	/* Start the chain */
505 	c->mbuf = mbuf;
506 	c->mbuf_tail = c->mbuf;
507 	c->nh = nh;
508 	c->th_last = th;
509 
510 	mbuf->m_pkthdr.len = mbuf->m_len;
511 
512 	/* Mangle header fields for later processing */
513 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
514 		struct ip *iph = nh;
515 		iph->ip_len = ntohs(iph->ip_len);
516 	} else {
517 		struct ip6_hdr *iph = nh;
518 		iph->ip6_plen = ntohs(iph->ip6_plen);
519 	}
520 }
521 
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 *
 * Returns 1 when the connection is still tracked (the buffered packet
 * was merged, used to start a chain, or delivered on its own) and 0 when
 * the connection was dropped via sfxge_lro_drop(), which also delivers
 * the buffered packet.  In every case c->next_buf no longer owns an mbuf
 * on return.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	/* Locate the TCP header and compute the frame length, measured
	 * from the Ethernet header, implied by the IP length field.
	 */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	/* Bytes of headers (Ethernet through TCP options) and of payload.
	 * The payload length is bounded by what was actually received.
	 */
	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	/* Never merge empty segments or ones carrying URG/SYN/RST/FIN. */
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		/* Flush any chain, penalise the in-order count by
		 * lro_loss_packets and resynchronise on this segment.
		 */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	/* Drop the connection if it has been idle for too long. */
	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		/* Unmergeable segment: flush the chain first so ordering is
		 * preserved, and forget the flow entirely on FIN/RST.
		 */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	/* Skip the hardware prefix; from here the mbuf joins a chain. */
	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	/* Ownership of the mbuf has moved to the connection's chain. */
	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	/* Pass the buffered packet up on its own, unmerged. */
	sfxge_rx_deliver(rxq, rx_buf);
	return (1);
}
628 
629 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
630 			       uint16_t l2_id, void *nh, struct tcphdr *th)
631 {
632 	unsigned bucket = conn_hash & st->conns_mask;
633 	struct sfxge_lro_conn *c;
634 
635 	if (st->conns_n[bucket] >= lro_chain_max) {
636 		++st->n_too_many;
637 		return;
638 	}
639 
640 	if (!TAILQ_EMPTY(&st->free_conns)) {
641 		c = TAILQ_FIRST(&st->free_conns);
642 		TAILQ_REMOVE(&st->free_conns, c, link);
643 	} else {
644 		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
645 		if (c == NULL)
646 			return;
647 		c->mbuf = NULL;
648 		c->next_buf.mbuf = NULL;
649 	}
650 
651 	/* Create the connection tracking data */
652 	++st->conns_n[bucket];
653 	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
654 	c->l2_id = l2_id;
655 	c->conn_hash = conn_hash;
656 	c->source = th->th_sport;
657 	c->dest = th->th_dport;
658 	c->n_in_order_pkts = 0;
659 	c->last_pkt_ticks = *(volatile int *)&ticks;
660 	c->delivered = 0;
661 	++st->n_new_stream;
662 	/* NB. We don't initialise c->next_seq, and it doesn't matter what
663 	 * value it has.  Most likely the next packet received for this
664 	 * connection will not match -- no harm done.
665 	 */
666 }
667 
/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 *
 * The packet is parsed down to its TCP header and looked up in the LRO
 * hash table.  On a hit, the packet previously buffered for that
 * connection (if any) is run through sfxge_lro_try_merge() and this
 * packet takes its place in c->next_buf; rx_buf then no longer owns the
 * mbuf.  On a miss, a new connection is created (when possible) and the
 * packet is delivered immediately.  Packets that are not unfragmented,
 * option-less IPv4 or IPv6 are delivered immediately as well.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
					    EFX_RX_HASHALG_TOEPLITZ,
					    mtod(m, uint8_t *));

	/* The Ethernet header follows the hardware prefix.  Record the
	 * VLAN id (if tagged) in l2_id and find the network header.
	 */
	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		/* Reject IP options (header longer than the base header)
		 * and fragments.
		 */
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	/* Search the bucket.  The comparisons use subtraction and OR
	 * rather than && so each test compiles to a single branch.
	 */
	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		/* Addresses can only be compared while the connection
		 * holds a chain, because c->nh points into that chain.
		 */
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			/* Process the previously held packet first.  A zero
			 * return means the connection was dropped, so this
			 * packet goes straight to the stack.
			 */
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		/* Hold this packet until the next one (or end of burst). */
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	/* No match: start tracking the flow and deliver this packet. */
	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(rxq, rx_buf);
}
774 
775 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
776 {
777 	struct sfxge_lro_state *st = &rxq->lro;
778 	struct sfxge_lro_conn *c;
779 	unsigned t;
780 
781 	while (!LIST_EMPTY(&st->active_conns)) {
782 		c = LIST_FIRST(&st->active_conns);
783 		if (!c->delivered && c->mbuf != NULL)
784 			sfxge_lro_deliver(st, c);
785 		if (sfxge_lro_try_merge(rxq, c)) {
786 			if (c->mbuf != NULL)
787 				sfxge_lro_deliver(st, c);
788 			LIST_REMOVE(c, active_link);
789 		}
790 		c->delivered = 0;
791 	}
792 
793 	t = *(volatile int *)&ticks;
794 	if (__predict_false(t != st->last_purge_ticks))
795 		sfxge_lro_purge_idle(rxq, t);
796 }
797 
798 #else	/* !SFXGE_LRO */
799 
/*
 * Stub used when the driver is built without SFXGE_LRO: does nothing.
 * NOTE(review): the packet is neither delivered nor freed here, so this
 * relies on callers never taking the LRO path in such builds -- confirm.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}
804 
/*
 * Stub used when the driver is built without SFXGE_LRO: there is no LRO
 * state to flush, so this does nothing.
 */
static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}
809 
810 #endif	/* SFXGE_LRO */
811 
/*
 * Process all receive descriptors between rxq->completed and
 * rxq->pending.
 *
 * rxq: receive queue; the caller must hold the lock of the event queue
 *      with the same index (asserted below).
 * eop: true at the end of a poll; any flows still held by LRO are then
 *      flushed.
 *
 * Each descriptor is either discarded (its mbuf freed) or passed to LRO
 * or directly to the stack.  Delivery runs one packet behind the scan
 * ("pipelined"): a packet is handed on only after the next one has been
 * examined and prefetched.  Finally the queue is topped up if its fill
 * level has fallen below the refill threshold.
 */
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = sc->ifnet->if_capenable;
	int lro_enabled = if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	/* Accepted packet awaiting delivery (one behind the scan). */
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		/* Queue is being stopped: just release the buffers. */
		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			uint16_t tmp_size;
			int rc;
			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
							   mtod(m, uint8_t *),
							   &tmp_size);
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
		}

		prefetch_read_many(mtod(m, caddr_t));

		/* Mask out checksum results the interface has not been
		 * configured to report, per address family.
		 */
		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		case EFX_PKT_IPV4:
			if (~if_capenable & IFCAP_RXCSUM)
				rx_desc->flags &=
				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			break;
		case EFX_PKT_IPV6:
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
			break;
		case 0:
			/* Check for loopback packets */
			/* NOTE(review): the Ethernet header is read at the
			 * start of the buffer here, whereas sfxge_lro()
			 * skips sc->rx_prefix_size first -- confirm this
			 * is correct when a prefix is present.
			 */
			{
				struct ether_header *etherhp;

				/*LINTED*/
				etherhp = mtod(m, struct ether_header *);

				if (etherhp->ether_type ==
				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
					EFSYS_PROBE(loopback);

					rxq->loopback++;
					goto discard;
				}
			}
			break;
		default:
			KASSERT(B_FALSE,
			    ("Rx descriptor with both IPv4 and IPv6 flags"));
			goto discard;
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		/* LRO is used only for TCP packets whose TCP checksum
		 * flag is still set.
		 */
		if (prev != NULL) {
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(rxq, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	/* Fill level, sampled before the last packet is delivered. */
	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(rxq, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}
933 
/*
 * Stop one receive queue: flush it in hardware, release every buffer
 * still on the ring, and tear down the common-code queue and its buffer
 * table entries.
 *
 * sc:    adapter; the adapter lock must be held (asserted).
 * index: receive queue number; the event queue with the same index is
 *        locked while queue state is modified.
 *
 * Flush failures and timeouts are logged but do not stop the teardown.
 */
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	/* From here on the fill and completion paths treat the queue as
	 * stopped.
	 */
	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	/* Request a flush and wait for flush_state to change; repeat up
	 * to 'retry' times if the flush is reported as failed.
	 */
	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		/* Drop the lock while waiting for the flush outcome. */
		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			SFXGE_EVQ_LOCK(evq);
			rxq->flush_state = SFXGE_FLUSH_FAILED;
			break;
		}

		/* Poll for up to 20 x 100 ms = 2 s. */
		count = 0;
		do {
			/* Spin for 100 ms */
			DELAY(100000);

			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
				break;

		} while (++count < 20);

		SFXGE_EVQ_LOCK(evq);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;
		}
		retry--;
	}
	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;
	}

	/* Treat every posted descriptor as completed.  Because the queue
	 * is no longer started, sfxge_rx_qcomplete() frees each buffer
	 * instead of delivering it.
	 */
	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	/* Reset the ring indices and counters for the next start. */
	rxq->added = 0;
	rxq->pushed = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}
1014 
1015 static int
1016 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1017 {
1018 	struct sfxge_rxq *rxq;
1019 	efsys_mem_t *esmp;
1020 	struct sfxge_evq *evq;
1021 	int rc;
1022 
1023 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1024 
1025 	rxq = sc->rxq[index];
1026 	esmp = &rxq->mem;
1027 	evq = sc->evq[index];
1028 
1029 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1030 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1031 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1032 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1033 
1034 	/* Program the buffer table. */
1035 	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1036 	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1037 		return (rc);
1038 
1039 	/* Create the common code receive queue. */
1040 	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1041 	    esmp, sc->rxq_entries, rxq->buf_base_id, EFX_RXQ_FLAG_NONE,
1042 	    evq->common, &rxq->common)) != 0)
1043 		goto fail;
1044 
1045 	SFXGE_EVQ_LOCK(evq);
1046 
1047 	/* Enable the receive queue. */
1048 	efx_rx_qenable(rxq->common);
1049 
1050 	rxq->init_state = SFXGE_RXQ_STARTED;
1051 	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1052 
1053 	/* Try to fill the queue from the pool. */
1054 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1055 
1056 	SFXGE_EVQ_UNLOCK(evq);
1057 
1058 	return (0);
1059 
1060 fail:
1061 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1062 	    EFX_RXQ_NBUFS(sc->rxq_entries));
1063 	return (rc);
1064 }
1065 
1066 void
1067 sfxge_rx_stop(struct sfxge_softc *sc)
1068 {
1069 	int index;
1070 
1071 	efx_mac_filter_default_rxq_clear(sc->enp);
1072 
1073 	/* Stop the receive queue(s) */
1074 	index = sc->rxq_count;
1075 	while (--index >= 0)
1076 		sfxge_rx_qstop(sc, index);
1077 
1078 	sc->rx_prefix_size = 0;
1079 	sc->rx_buffer_size = 0;
1080 
1081 	efx_rx_fini(sc->enp);
1082 }
1083 
1084 int
1085 sfxge_rx_start(struct sfxge_softc *sc)
1086 {
1087 	struct sfxge_intr *intr;
1088 	const efx_nic_cfg_t *encp;
1089 	size_t hdrlen, align, reserved;
1090 	int index;
1091 	int rc;
1092 
1093 	intr = &sc->intr;
1094 
1095 	/* Initialize the common code receive module. */
1096 	if ((rc = efx_rx_init(sc->enp)) != 0)
1097 		return (rc);
1098 
1099 	encp = efx_nic_cfg_get(sc->enp);
1100 	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1101 
1102 	/* Calculate the receive packet buffer size. */
1103 	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1104 
1105 	/* Ensure IP headers are 32bit aligned */
1106 	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1107 	sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;
1108 
1109 	sc->rx_buffer_size += sc->rx_buffer_align;
1110 
1111 	/* Align end of packet buffer for RX DMA end padding */
1112 	align = MAX(1, encp->enc_rx_buf_align_end);
1113 	EFSYS_ASSERT(ISP2(align));
1114 	sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);
1115 
1116 	/*
1117 	 * Standard mbuf zones only guarantee pointer-size alignment;
1118 	 * we need extra space to align to the cache line
1119 	 */
1120 	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1121 
1122 	/* Select zone for packet buffers */
1123 	if (reserved <= MCLBYTES)
1124 		sc->rx_cluster_size = MCLBYTES;
1125 	else if (reserved <= MJUMPAGESIZE)
1126 		sc->rx_cluster_size = MJUMPAGESIZE;
1127 	else if (reserved <= MJUM9BYTES)
1128 		sc->rx_cluster_size = MJUM9BYTES;
1129 	else
1130 		sc->rx_cluster_size = MJUM16BYTES;
1131 
1132 	/*
1133 	 * Set up the scale table.  Enable all hash types and hash insertion.
1134 	 */
1135 	for (index = 0; index < nitems(sc->rx_indir_table); index++)
1136 #ifdef RSS
1137 		sc->rx_indir_table[index] =
1138 			rss_get_indirection_to_bucket(index) % sc->rxq_count;
1139 #else
1140 		sc->rx_indir_table[index] = index % sc->rxq_count;
1141 #endif
1142 	if ((rc = efx_rx_scale_tbl_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1143 				       sc->rx_indir_table,
1144 				       nitems(sc->rx_indir_table))) != 0)
1145 		goto fail;
1146 	(void)efx_rx_scale_mode_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1147 	    EFX_RX_HASHALG_TOEPLITZ,
1148 	    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
1149 	    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);
1150 
1151 #ifdef RSS
1152 	rss_getkey(toep_key);
1153 #endif
1154 	if ((rc = efx_rx_scale_key_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1155 				       toep_key,
1156 				       sizeof(toep_key))) != 0)
1157 		goto fail;
1158 
1159 	/* Start the receive queue(s). */
1160 	for (index = 0; index < sc->rxq_count; index++) {
1161 		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1162 			goto fail2;
1163 	}
1164 
1165 	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1166 					    sc->intr.n_alloc > 1);
1167 	if (rc != 0)
1168 		goto fail3;
1169 
1170 	return (0);
1171 
1172 fail3:
1173 fail2:
1174 	while (--index >= 0)
1175 		sfxge_rx_qstop(sc, index);
1176 
1177 fail:
1178 	efx_rx_fini(sc->enp);
1179 
1180 	return (rc);
1181 }
1182 
1183 #ifdef SFXGE_LRO
1184 
1185 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1186 {
1187 	struct sfxge_lro_state *st = &rxq->lro;
1188 	unsigned i;
1189 
1190 	st->conns_mask = lro_table_size - 1;
1191 	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1192 		("lro_table_size must be a power of 2"));
1193 	st->sc = rxq->sc;
1194 	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1195 			   M_SFXGE, M_WAITOK);
1196 	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1197 			     M_SFXGE, M_WAITOK);
1198 	for (i = 0; i <= st->conns_mask; ++i) {
1199 		TAILQ_INIT(&st->conns[i]);
1200 		st->conns_n[i] = 0;
1201 	}
1202 	LIST_INIT(&st->active_conns);
1203 	TAILQ_INIT(&st->free_conns);
1204 }
1205 
1206 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1207 {
1208 	struct sfxge_lro_state *st = &rxq->lro;
1209 	struct sfxge_lro_conn *c;
1210 	unsigned i;
1211 
1212 	/* Return cleanly if sfxge_lro_init() has not been called. */
1213 	if (st->conns == NULL)
1214 		return;
1215 
1216 	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1217 
1218 	for (i = 0; i <= st->conns_mask; ++i) {
1219 		while (!TAILQ_EMPTY(&st->conns[i])) {
1220 			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1221 			sfxge_lro_drop(rxq, c);
1222 		}
1223 	}
1224 
1225 	while (!TAILQ_EMPTY(&st->free_conns)) {
1226 		c = TAILQ_FIRST(&st->free_conns);
1227 		TAILQ_REMOVE(&st->free_conns, c, link);
1228 		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1229 		free(c, M_SFXGE);
1230 	}
1231 
1232 	free(st->conns_n, M_SFXGE);
1233 	free(st->conns, M_SFXGE);
1234 	st->conns = NULL;
1235 }
1236 
1237 #else
1238 
/* LRO support is compiled out (SFXGE_LRO undefined): nothing to set up. */
static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
	(void)rxq;
}
1243 
/* LRO support is compiled out (SFXGE_LRO undefined): nothing to tear down. */
static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	(void)rxq;
}
1248 
1249 #endif	/* SFXGE_LRO */
1250 
1251 static void
1252 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1253 {
1254 	struct sfxge_rxq *rxq;
1255 
1256 	rxq = sc->rxq[index];
1257 
1258 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1259 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1260 
1261 	/* Free the context array and the flow table. */
1262 	free(rxq->queue, M_SFXGE);
1263 	sfxge_lro_fini(rxq);
1264 
1265 	/* Release DMA memory. */
1266 	sfxge_dma_free(&rxq->mem);
1267 
1268 	sc->rxq[index] = NULL;
1269 
1270 	free(rxq, M_SFXGE);
1271 }
1272 
1273 static int
1274 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1275 {
1276 	struct sfxge_rxq *rxq;
1277 	struct sfxge_evq *evq;
1278 	efsys_mem_t *esmp;
1279 	int rc;
1280 
1281 	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1282 
1283 	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1284 	rxq->sc = sc;
1285 	rxq->index = index;
1286 	rxq->entries = sc->rxq_entries;
1287 	rxq->ptr_mask = rxq->entries - 1;
1288 	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1289 
1290 	sc->rxq[index] = rxq;
1291 	esmp = &rxq->mem;
1292 
1293 	evq = sc->evq[index];
1294 
1295 	/* Allocate and zero DMA space. */
1296 	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1297 		return (rc);
1298 
1299 	/* Allocate buffer table entries. */
1300 	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1301 				 &rxq->buf_base_id);
1302 
1303 	/* Allocate the context array and the flow table. */
1304 	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1305 	    M_SFXGE, M_WAITOK | M_ZERO);
1306 	sfxge_lro_init(rxq);
1307 
1308 	callout_init(&rxq->refill_callout, 1);
1309 
1310 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1311 
1312 	return (0);
1313 }
1314 
/*
 * Build one statistics table entry: the stringified statistic name and
 * the byte offset of the corresponding counter within struct sfxge_rxq.
 */
#define	SFXGE_RX_STAT(stat, member)					\
	{ .name = #stat, .offset = offsetof(struct sfxge_rxq, member) }

/*
 * Per-queue receive statistics exported through sysctl.  Each entry is
 * looked up by index in sfxge_rx_stat_handler().  The table is empty
 * unless SFXGE_LRO is defined.
 */
static const struct {
	const char *name;	/* sysctl leaf name */
	size_t offset;		/* offset of the counter in struct sfxge_rxq */
} sfxge_rx_stats[] = {
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};
1332 
1333 static int
1334 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1335 {
1336 	struct sfxge_softc *sc = arg1;
1337 	unsigned int id = arg2;
1338 	unsigned int sum, index;
1339 
1340 	/* Sum across all RX queues */
1341 	sum = 0;
1342 	for (index = 0; index < sc->rxq_count; index++)
1343 		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1344 					 sfxge_rx_stats[id].offset);
1345 
1346 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1347 }
1348 
1349 static void
1350 sfxge_rx_stat_init(struct sfxge_softc *sc)
1351 {
1352 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1353 	struct sysctl_oid_list *stat_list;
1354 	unsigned int id;
1355 
1356 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1357 
1358 	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1359 		SYSCTL_ADD_PROC(ctx, stat_list, OID_AUTO,
1360 		    sfxge_rx_stats[id].name,
1361 		    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
1362 		    sc, id, sfxge_rx_stat_handler, "IU", "");
1363 	}
1364 }
1365 
1366 void
1367 sfxge_rx_fini(struct sfxge_softc *sc)
1368 {
1369 	int index;
1370 
1371 	index = sc->rxq_count;
1372 	while (--index >= 0)
1373 		sfxge_rx_qfini(sc, index);
1374 
1375 	sc->rxq_count = 0;
1376 }
1377 
1378 int
1379 sfxge_rx_init(struct sfxge_softc *sc)
1380 {
1381 	struct sfxge_intr *intr;
1382 	int index;
1383 	int rc;
1384 
1385 #ifdef SFXGE_LRO
1386 	if (!ISP2(lro_table_size)) {
1387 		log(LOG_ERR, "%s=%u must be power of 2",
1388 		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1389 		rc = EINVAL;
1390 		goto fail_lro_table_size;
1391 	}
1392 
1393 	if (lro_idle_ticks == 0)
1394 		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1395 #endif
1396 
1397 	intr = &sc->intr;
1398 
1399 	sc->rxq_count = intr->n_alloc;
1400 
1401 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1402 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1403 
1404 	/* Initialize the receive queue(s) - one per interrupt. */
1405 	for (index = 0; index < sc->rxq_count; index++) {
1406 		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1407 			goto fail;
1408 	}
1409 
1410 	sfxge_rx_stat_init(sc);
1411 
1412 	return (0);
1413 
1414 fail:
1415 	/* Tear down the receive queue(s). */
1416 	while (--index >= 0)
1417 		sfxge_rx_qfini(sc, index);
1418 
1419 	sc->rxq_count = 0;
1420 
1421 #ifdef SFXGE_LRO
1422 fail_lro_table_size:
1423 #endif
1424 	return (rc);
1425 }
1426