xref: /freebsd/sys/dev/sfxge/sfxge_rx.c (revision 7d8f797b725e3efc0a4256554654780df83c456c)
1 /*-
2  * Copyright (c) 2010-2015 Solarflare Communications Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright notice,
12  *    this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  *    this list of conditions and the following disclaimer in the documentation
15  *    and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * The views and conclusions contained in the software and documentation are
30  * those of the authors and should not be interpreted as representing official
31  * policies, either expressed or implied, of the FreeBSD Project.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/types.h>
38 #include <sys/mbuf.h>
39 #include <sys/smp.h>
40 #include <sys/socket.h>
41 #include <sys/sysctl.h>
42 #include <sys/syslog.h>
43 #include <sys/limits.h>
45 
46 #include <net/ethernet.h>
47 #include <net/if.h>
48 #include <net/if_vlan_var.h>
49 
50 #include <netinet/in.h>
51 #include <netinet/ip.h>
52 #include <netinet/ip6.h>
53 #include <netinet/tcp.h>
54 
55 #include <machine/in_cksum.h>
56 
57 #include "common/efx.h"
58 
59 
60 #include "sfxge.h"
61 #include "sfxge_rx.h"
62 
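/* Refill the receive queue when the number of posted descriptors drops
 * below 90% of the queue limit (see sfxge_rx_qcomplete()).
 */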
63 #define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
64 
65 #ifdef SFXGE_LRO
66 
67 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
68 	    "Large receive offload (LRO) parameters");
69 
70 #define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
71 
72 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
73  * means we can accelerate a larger number of streams.
74  */
75 static unsigned lro_table_size = 128;
76 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
77 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
78 	    &lro_table_size, 0,
79 	    "Size of the LRO hash table (must be a power of 2)");
80 
81 /* Maximum length of a hash chain.  If chains get too long then the lookup
82  * time increases and may exceed the benefit of LRO.
83  */
84 static unsigned lro_chain_max = 20;
85 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
86 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
87 	    &lro_chain_max, 0,
88 	    "The maximum length of a hash chain");
89 
90 /* Maximum time (in ticks) that a connection can be idle before its LRO
91  * state is discarded.
92  */
93 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
94 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
95 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
96 	    &lro_idle_ticks, 0,
97 	    "The maximum time (in ticks) that a connection can be idle "
98 	    "before its LRO state is discarded");
99 
100 /* Number of packets with payload that must arrive in-order before a
101  * connection is eligible for LRO.  The idea is we should avoid coalescing
102  * segments when the sender is in slow-start because reducing the ACK rate
103  * can damage performance.
104  */
105 static int lro_slow_start_packets = 2000;
106 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
107 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
108 	    &lro_slow_start_packets, 0,
109 	    "Number of packets with payload that must arrive in-order before "
110 	    "a connection is eligible for LRO");
111 
112 /* Number of packets with payload that must arrive in-order following loss
113  * before a connection is eligible for LRO.  The idea is we should avoid
114  * coalescing segments when the sender is recovering from loss, because
115  * reducing the ACK rate can damage performance.
116  */
117 static int lro_loss_packets = 20;
118 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
119 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
120 	    &lro_loss_packets, 0,
121 	    "Number of packets with payload that must arrive in-order "
122 	    "following loss before a connection is eligible for LRO");
123 
124 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
125 #define	SFXGE_LRO_L2_ID_VLAN 0x4000
126 #define	SFXGE_LRO_L2_ID_IPV6 0x8000
127 #define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
128 #define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
129 
130 /* Compare IPv6 addresses, avoiding conditional branches */
131 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
132 				   const struct in6_addr *right)
133 {
134 #if LONG_BIT == 64
135 	const uint64_t *left64 = (const uint64_t *)left;
136 	const uint64_t *right64 = (const uint64_t *)right;
137 	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
138 #else
139 	return (left->s6_addr32[0] - right->s6_addr32[0]) |
140 	       (left->s6_addr32[1] - right->s6_addr32[1]) |
141 	       (left->s6_addr32[2] - right->s6_addr32[2]) |
142 	       (left->s6_addr32[3] - right->s6_addr32[3]);
143 #endif
144 }
145 
146 #endif	/* SFXGE_LRO */
147 
148 void
149 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
150 {
151 
152 	rxq->flush_state = SFXGE_FLUSH_DONE;
153 }
154 
155 void
156 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
157 {
158 
159 	rxq->flush_state = SFXGE_FLUSH_FAILED;
160 }
161 
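/* RSS hash key for Toeplitz hashing, programmed into the NIC via
 * efx_rx_scale_key_set() in sfxge_rx_start().
 */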
162 static uint8_t toep_key[] = {
163 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
164 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
165 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
166 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
167 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
168 };
169 
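/*
 * Callout handler: post a software (magic) event to the queue's event
 * queue so that the actual refill is performed in the event processing
 * context.
 */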
170 static void
171 sfxge_rx_post_refill(void *arg)
172 {
173 	struct sfxge_rxq *rxq = arg;
174 	struct sfxge_softc *sc;
175 	unsigned int index;
176 	struct sfxge_evq *evq;
177 	uint16_t magic;
178 
179 	sc = rxq->sc;
180 	index = rxq->index;
181 	evq = sc->evq[index];
182 
183 	magic = SFXGE_MAGIC_RX_QREFILL | index;
184 
185 	/* This is guaranteed due to the start/stop order of rx and ev */
186 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
187 	    ("evq not started"));
188 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
189 	    ("rxq not started"));
190 	efx_ev_qpost(evq->common, magic);
191 }
192 
193 static void
194 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
195 {
196 	/* Initially retry after 100 ms, but back off in case of
197 	 * repeated failures as we probably have to wait for the
198 	 * administrator to raise the pool limit. */
199 	if (retrying)
200 		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
201 	else
202 		rxq->refill_delay = hz / 10;
203 
204 	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
205 			     sfxge_rx_post_refill, rxq);
206 }
207 
208 static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
209 {
210 	struct mb_args args;
211 	struct mbuf *m;
212 
213 	/* Allocate mbuf structure */
214 	args.flags = M_PKTHDR;
215 	args.type = MT_DATA;
216 	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);
217 
218 	/* Allocate (and attach) packet buffer */
219 	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
220 		uma_zfree(zone_mbuf, m);
221 		m = NULL;
222 	}
223 
224 	return (m);
225 }
226 
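/* Maximum number of buffer addresses posted per efx_rx_qpost() call */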
227 #define	SFXGE_REFILL_BATCH  64
228 
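/*
 * Add up to 'target' receive buffers to the queue, posting their DMA
 * addresses to the hardware in batches of SFXGE_REFILL_BATCH.
 */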
229 static void
230 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
231 {
232 	struct sfxge_softc *sc;
233 	unsigned int index;
234 	struct sfxge_evq *evq;
235 	unsigned int batch;
236 	unsigned int rxfill;
237 	unsigned int mblksize;
238 	int ntodo;
239 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
240 
241 	sc = rxq->sc;
242 	index = rxq->index;
243 	evq = sc->evq[index];
244 
245 	prefetch_read_many(sc->enp);
246 	prefetch_read_many(rxq->common);
247 
248 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
249 
250 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
251 		return;
252 
253 	rxfill = rxq->added - rxq->completed;
254 	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
255 	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
256 	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
257 	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
258 	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
259 
260 	if (ntodo == 0)
261 		return;
262 
263 	batch = 0;
264 	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
265 	while (ntodo-- > 0) {
266 		unsigned int id;
267 		struct sfxge_rx_sw_desc *rx_desc;
268 		bus_dma_segment_t seg;
269 		struct mbuf *m;
270 
271 		id = (rxq->added + batch) & rxq->ptr_mask;
272 		rx_desc = &rxq->queue[id];
273 		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
274 
275 		rx_desc->flags = EFX_DISCARD;
276 		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
277 		if (m == NULL)
278 			break;
279 
280 		/* m_len specifies length of area to be mapped for DMA */
281 		m->m_len  = mblksize;
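		/* Align the buffer start on a cache line, then offset it so
		 * the IP header ends up 32-bit aligned (see sfxge_rx_start()).
		 */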
282 		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
283 		m->m_data += sc->rx_buffer_align;
284 
285 		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
286 		addr[batch++] = seg.ds_addr;
287 
288 		if (batch == SFXGE_REFILL_BATCH) {
289 			efx_rx_qpost(rxq->common, addr, mblksize, batch,
290 			    rxq->completed, rxq->added);
291 			rxq->added += batch;
292 			batch = 0;
293 		}
294 	}
295 
296 	if (ntodo != 0)
297 		sfxge_rx_schedule_refill(rxq, retrying);
298 
299 	if (batch != 0) {
300 		efx_rx_qpost(rxq->common, addr, mblksize, batch,
301 		    rxq->completed, rxq->added);
302 		rxq->added += batch;
303 	}
304 
305 	/* Make the descriptors visible to the hardware */
306 	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
307 			BUS_DMASYNC_PREWRITE);
308 
309 	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
310 
311 	/* The queue could still be empty if no descriptors were actually
312 	 * pushed, in which case there will be no event to cause the next
313 	 * refill, so we must schedule a refill ourselves.
314 	 */
315 	if (rxq->pushed == rxq->completed) {
316 		sfxge_rx_schedule_refill(rxq, retrying);
317 	}
318 }
319 
320 void
321 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
322 {
323 
324 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
325 		return;
326 
327 	/* Make sure the queue is full */
328 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
329 }
330 
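/* Hand a completed packet to the network stack via if_input() */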
331 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
332 {
333 	struct ifnet *ifp = sc->ifnet;
334 
335 	m->m_pkthdr.rcvif = ifp;
336 	m->m_pkthdr.csum_data = 0xffff;
337 	ifp->if_input(ifp, m);
338 }
339 
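/*
 * Convert the hardware receive flags into mbuf checksum and RSS metadata,
 * strip the prefix and deliver the packet to the stack.
 */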
340 static void
341 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
342 {
343 	struct mbuf *m = rx_desc->mbuf;
344 	int flags = rx_desc->flags;
345 	int csum_flags;
346 
347 	/* Convert checksum flags */
348 	csum_flags = (flags & EFX_CKSUM_IPV4) ?
349 		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
350 	if (flags & EFX_CKSUM_TCPUDP)
351 		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
352 
353 	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
354 		m->m_pkthdr.flowid =
355 			efx_psuedo_hdr_hash_get(sc->enp,
356 						EFX_RX_HASHALG_TOEPLITZ,
357 						mtod(m, uint8_t *));
358 		/* The hash covers a 4-tuple for TCP only */
359 		M_HASHTYPE_SET(m,
360 		    (flags & EFX_PKT_IPV4) ?
361 			((flags & EFX_PKT_TCP) ?
362 			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
363 			((flags & EFX_PKT_TCP) ?
364 			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
365 	}
366 	m->m_data += sc->rx_prefix_size;
367 	m->m_len = rx_desc->size - sc->rx_prefix_size;
368 	m->m_pkthdr.len = m->m_len;
369 	m->m_pkthdr.csum_flags = csum_flags;
370 	__sfxge_rx_deliver(sc, rx_desc->mbuf);
371 
372 	rx_desc->flags = EFX_DISCARD;
373 	rx_desc->mbuf = NULL;
374 }
375 
376 #ifdef SFXGE_LRO
377 
378 static void
379 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
380 {
381 	struct sfxge_softc *sc = st->sc;
382 	struct mbuf *m = c->mbuf;
383 	struct tcphdr *c_th;
384 	int csum_flags;
385 
386 	KASSERT(m, ("no mbuf to deliver"));
387 
388 	++st->n_bursts;
389 
390 	/* Finish off packet munging and recalculate IP header checksum. */
391 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
392 		struct ip *iph = c->nh;
393 		iph->ip_len = htons(iph->ip_len);
394 		iph->ip_sum = 0;
395 		iph->ip_sum = in_cksum_hdr(iph);
396 		c_th = (struct tcphdr *)(iph + 1);
397 		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
398 			      CSUM_IP_CHECKED | CSUM_IP_VALID);
399 	} else {
400 		struct ip6_hdr *iph = c->nh;
401 		iph->ip6_plen = htons(iph->ip6_plen);
402 		c_th = (struct tcphdr *)(iph + 1);
403 		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
404 	}
405 
406 	c_th->th_win = c->th_last->th_win;
407 	c_th->th_ack = c->th_last->th_ack;
408 	if (c_th->th_off == c->th_last->th_off) {
409 		/* Copy TCP options (take care to avoid going negative). */
410 		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
411 		memcpy(c_th + 1, c->th_last + 1, optlen);
412 	}
413 
414 	m->m_pkthdr.flowid = c->conn_hash;
415 	M_HASHTYPE_SET(m,
416 	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
417 		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
418 
419 	m->m_pkthdr.csum_flags = csum_flags;
420 	__sfxge_rx_deliver(sc, m);
421 
422 	c->mbuf = NULL;
423 	c->delivered = 1;
424 }
425 
426 /* Drop the given connection, and add it to the free list. */
427 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
428 {
429 	unsigned bucket;
430 
431 	KASSERT(!c->mbuf, ("found orphaned mbuf"));
432 
433 	if (c->next_buf.mbuf != NULL) {
434 		sfxge_rx_deliver(rxq->sc, &c->next_buf);
435 		LIST_REMOVE(c, active_link);
436 	}
437 
438 	bucket = c->conn_hash & rxq->lro.conns_mask;
439 	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
440 	--rxq->lro.conns_n[bucket];
441 	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
442 	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
443 }
444 
445 /* Stop tracking connections that have gone idle in order to keep hash
446  * chains short.
447  */
448 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
449 {
450 	struct sfxge_lro_conn *c;
451 	unsigned i;
452 
453 	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
454 		("found active connections"));
455 
456 	rxq->lro.last_purge_ticks = now;
457 	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
458 		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
459 			continue;
460 
461 		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
462 		if (now - c->last_pkt_ticks > lro_idle_ticks) {
463 			++rxq->lro.n_drop_idle;
464 			sfxge_lro_drop(rxq, c);
465 		}
466 	}
467 }
468 
469 static void
470 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
471 		struct mbuf *mbuf, struct tcphdr *th)
472 {
473 	struct tcphdr *c_th;
474 
475 	/* Tack the new mbuf onto the chain. */
476 	KASSERT(!mbuf->m_next, ("mbuf already chained"));
477 	c->mbuf_tail->m_next = mbuf;
478 	c->mbuf_tail = mbuf;
479 
480 	/* Increase length appropriately */
481 	c->mbuf->m_pkthdr.len += mbuf->m_len;
482 
483 	/* Update the connection state flags */
484 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
485 		struct ip *iph = c->nh;
486 		iph->ip_len += mbuf->m_len;
487 		c_th = (struct tcphdr *)(iph + 1);
488 	} else {
489 		struct ip6_hdr *iph = c->nh;
490 		iph->ip6_plen += mbuf->m_len;
491 		c_th = (struct tcphdr *)(iph + 1);
492 	}
493 	c_th->th_flags |= (th->th_flags & TH_PUSH);
494 	c->th_last = th;
495 	++st->n_merges;
496 
497 	/* Pass packet up now if another segment could overflow the IP
498 	 * length.
499 	 */
500 	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
501 		sfxge_lro_deliver(st, c);
502 }
503 
504 static void
505 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
506 		struct mbuf *mbuf, void *nh, struct tcphdr *th)
507 {
508 	/* Start the chain */
509 	c->mbuf = mbuf;
510 	c->mbuf_tail = c->mbuf;
511 	c->nh = nh;
512 	c->th_last = th;
513 
514 	mbuf->m_pkthdr.len = mbuf->m_len;
515 
516 	/* Mangle header fields for later processing */
517 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
518 		struct ip *iph = nh;
519 		iph->ip_len = ntohs(iph->ip_len);
520 	} else {
521 		struct ip6_hdr *iph = nh;
522 		iph->ip6_plen = ntohs(iph->ip6_plen);
523 	}
524 }
525 
526 /* Try to merge or otherwise hold or deliver (as appropriate) the
527  * packet buffered for this connection (c->next_buf).  Return a flag
528  * indicating whether the connection is still active for LRO purposes.
529  */
530 static int
531 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
532 {
533 	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
534 	char *eh = c->next_eh;
535 	int data_length, hdr_length, dont_merge;
536 	unsigned th_seq, pkt_length;
537 	struct tcphdr *th;
538 	unsigned now;
539 
540 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
541 		struct ip *iph = c->next_nh;
542 		th = (struct tcphdr *)(iph + 1);
543 		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
544 	} else {
545 		struct ip6_hdr *iph = c->next_nh;
546 		th = (struct tcphdr *)(iph + 1);
547 		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
548 	}
549 
550 	hdr_length = (char *) th + th->th_off * 4 - eh;
551 	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
552 		       hdr_length);
553 	th_seq = ntohl(th->th_seq);
554 	dont_merge = ((data_length <= 0)
555 		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
556 
557 	/* Check for options other than aligned timestamp. */
558 	if (th->th_off != 5) {
559 		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
560 		if (th->th_off == 8 &&
561 		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
562 					(TCPOPT_NOP << 16) |
563 					(TCPOPT_TIMESTAMP << 8) |
564 					TCPOLEN_TIMESTAMP)) {
565 			/* timestamp option -- okay */
566 		} else {
567 			dont_merge = 1;
568 		}
569 	}
570 
571 	if (__predict_false(th_seq != c->next_seq)) {
572 		/* Out-of-order, so start counting again. */
573 		if (c->mbuf != NULL)
574 			sfxge_lro_deliver(&rxq->lro, c);
575 		c->n_in_order_pkts -= lro_loss_packets;
576 		c->next_seq = th_seq + data_length;
577 		++rxq->lro.n_misorder;
578 		goto deliver_buf_out;
579 	}
580 	c->next_seq = th_seq + data_length;
581 
582 	now = ticks;
583 	if (now - c->last_pkt_ticks > lro_idle_ticks) {
584 		++rxq->lro.n_drop_idle;
585 		if (c->mbuf != NULL)
586 			sfxge_lro_deliver(&rxq->lro, c);
587 		sfxge_lro_drop(rxq, c);
588 		return (0);
589 	}
590 	c->last_pkt_ticks = ticks;
591 
592 	if (c->n_in_order_pkts < lro_slow_start_packets) {
593 		/* May be in slow-start, so don't merge. */
594 		++rxq->lro.n_slow_start;
595 		++c->n_in_order_pkts;
596 		goto deliver_buf_out;
597 	}
598 
599 	if (__predict_false(dont_merge)) {
600 		if (c->mbuf != NULL)
601 			sfxge_lro_deliver(&rxq->lro, c);
602 		if (th->th_flags & (TH_FIN | TH_RST)) {
603 			++rxq->lro.n_drop_closed;
604 			sfxge_lro_drop(rxq, c);
605 			return (0);
606 		}
607 		goto deliver_buf_out;
608 	}
609 
610 	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
611 
612 	if (__predict_true(c->mbuf != NULL)) {
613 		/* Remove headers and any padding */
614 		rx_buf->mbuf->m_data += hdr_length;
615 		rx_buf->mbuf->m_len = data_length;
616 
617 		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
618 	} else {
619 		/* Remove any padding */
620 		rx_buf->mbuf->m_len = pkt_length;
621 
622 		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
623 	}
624 
625 	rx_buf->mbuf = NULL;
626 	return (1);
627 
628  deliver_buf_out:
629 	sfxge_rx_deliver(rxq->sc, rx_buf);
630 	return (1);
631 }
632 
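/* Allocate (or reuse) and initialise tracking state for a new LRO connection */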
633 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
634 			       uint16_t l2_id, void *nh, struct tcphdr *th)
635 {
636 	unsigned bucket = conn_hash & st->conns_mask;
637 	struct sfxge_lro_conn *c;
638 
639 	if (st->conns_n[bucket] >= lro_chain_max) {
640 		++st->n_too_many;
641 		return;
642 	}
643 
644 	if (!TAILQ_EMPTY(&st->free_conns)) {
645 		c = TAILQ_FIRST(&st->free_conns);
646 		TAILQ_REMOVE(&st->free_conns, c, link);
647 	} else {
648 		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
649 		if (c == NULL)
650 			return;
651 		c->mbuf = NULL;
652 		c->next_buf.mbuf = NULL;
653 	}
654 
655 	/* Create the connection tracking data */
656 	++st->conns_n[bucket];
657 	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
658 	c->l2_id = l2_id;
659 	c->conn_hash = conn_hash;
660 	c->source = th->th_sport;
661 	c->dest = th->th_dport;
662 	c->n_in_order_pkts = 0;
663 	c->last_pkt_ticks = *(volatile int *)&ticks;
664 	c->delivered = 0;
665 	++st->n_new_stream;
666 	/* NB. We don't initialise c->next_seq, and it doesn't matter what
667 	 * value it has.  Most likely the next packet received for this
668 	 * connection will not match -- no harm done.
669 	 */
670 }
671 
672 /* Process mbuf and decide whether to dispatch it to the stack now or
673  * later.
674  */
675 static void
676 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
677 {
678 	struct sfxge_softc *sc = rxq->sc;
679 	struct mbuf *m = rx_buf->mbuf;
680 	struct ether_header *eh;
681 	struct sfxge_lro_conn *c;
682 	uint16_t l2_id;
683 	uint16_t l3_proto;
684 	void *nh;
685 	struct tcphdr *th;
686 	uint32_t conn_hash;
687 	unsigned bucket;
688 
689 	/* Get the hardware hash */
690 	conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
691 					    EFX_RX_HASHALG_TOEPLITZ,
692 					    mtod(m, uint8_t *));
693 
694 	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
695 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
696 		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
697 		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
698 			SFXGE_LRO_L2_ID_VLAN;
699 		l3_proto = veh->evl_proto;
700 		nh = veh + 1;
701 	} else {
702 		l2_id = 0;
703 		l3_proto = eh->ether_type;
704 		nh = eh + 1;
705 	}
706 
707 	/* Check whether this is a suitable packet (unfragmented
708 	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
709 	 * length, and compute a hash if necessary.  If not, return.
710 	 */
711 	if (l3_proto == htons(ETHERTYPE_IP)) {
712 		struct ip *iph = nh;
713 
714 		KASSERT(iph->ip_p == IPPROTO_TCP,
715 		    ("IPv4 protocol is not TCP, but packet marker is set"));
716 		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
717 		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
718 			goto deliver_now;
719 		th = (struct tcphdr *)(iph + 1);
720 	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
721 		struct ip6_hdr *iph = nh;
722 
723 		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
724 		    ("IPv6 next header is not TCP, but packet marker is set"));
725 		l2_id |= SFXGE_LRO_L2_ID_IPV6;
726 		th = (struct tcphdr *)(iph + 1);
727 	} else {
728 		goto deliver_now;
729 	}
730 
731 	bucket = conn_hash & rxq->lro.conns_mask;
732 
733 	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
734 		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
735 			continue;
736 		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
737 			continue;
738 		if (c->mbuf != NULL) {
739 			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
740 				struct ip *c_iph, *iph = nh;
741 				c_iph = c->nh;
742 				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
743 				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
744 					continue;
745 			} else {
746 				struct ip6_hdr *c_iph, *iph = nh;
747 				c_iph = c->nh;
748 				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
749 				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
750 					continue;
751 			}
752 		}
753 
754 		/* Re-insert at head of list to reduce lookup time. */
755 		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
756 		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
757 
758 		if (c->next_buf.mbuf != NULL) {
759 			if (!sfxge_lro_try_merge(rxq, c))
760 				goto deliver_now;
761 		} else {
762 			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
763 			    active_link);
764 		}
765 		c->next_buf = *rx_buf;
766 		c->next_eh = eh;
767 		c->next_nh = nh;
768 
769 		rx_buf->mbuf = NULL;
770 		rx_buf->flags = EFX_DISCARD;
771 		return;
772 	}
773 
774 	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
775  deliver_now:
776 	sfxge_rx_deliver(sc, rx_buf);
777 }
778 
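/*
 * At the end of an event burst, deliver or merge any held packets for all
 * active connections and periodically purge idle connection state.
 */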
779 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
780 {
781 	struct sfxge_lro_state *st = &rxq->lro;
782 	struct sfxge_lro_conn *c;
783 	unsigned t;
784 
785 	while (!LIST_EMPTY(&st->active_conns)) {
786 		c = LIST_FIRST(&st->active_conns);
787 		if (!c->delivered && c->mbuf != NULL)
788 			sfxge_lro_deliver(st, c);
789 		if (sfxge_lro_try_merge(rxq, c)) {
790 			if (c->mbuf != NULL)
791 				sfxge_lro_deliver(st, c);
792 			LIST_REMOVE(c, active_link);
793 		}
794 		c->delivered = 0;
795 	}
796 
797 	t = *(volatile int *)&ticks;
798 	if (__predict_false(t != st->last_purge_ticks))
799 		sfxge_lro_purge_idle(rxq, t);
800 }
801 
802 #else	/* !SFXGE_LRO */
803 
804 static void
805 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
806 {
807 }
808 
809 static void
810 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
811 {
812 }
813 
814 #endif	/* SFXGE_LRO */
815 
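/*
 * Process descriptors completed by the hardware: validate each packet,
 * pass it to LRO or directly up the stack, and top up the queue if the
 * fill level has dropped below the refill threshold.
 */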
816 void
817 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
818 {
819 	struct sfxge_softc *sc = rxq->sc;
820 	int if_capenable = sc->ifnet->if_capenable;
821 	int lro_enabled = if_capenable & IFCAP_LRO;
822 	unsigned int index;
823 	struct sfxge_evq *evq;
824 	unsigned int completed;
825 	unsigned int level;
826 	struct mbuf *m;
827 	struct sfxge_rx_sw_desc *prev = NULL;
828 
829 	index = rxq->index;
830 	evq = sc->evq[index];
831 
832 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
833 
834 	completed = rxq->completed;
835 	while (completed != rxq->pending) {
836 		unsigned int id;
837 		struct sfxge_rx_sw_desc *rx_desc;
838 
839 		id = completed++ & rxq->ptr_mask;
840 		rx_desc = &rxq->queue[id];
841 		m = rx_desc->mbuf;
842 
843 		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
844 			goto discard;
845 
846 		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
847 			goto discard;
848 
849 		/* Read the length from the pseudo header if required */
850 		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
851 			uint16_t tmp_size;
852 			int rc;
853 			rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
854 							   mtod(m, uint8_t *),
855 							   &tmp_size);
856 			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
857 			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
858 		}
859 
860 		prefetch_read_many(mtod(m, caddr_t));
861 
862 		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
863 		case EFX_PKT_IPV4:
864 			if (~if_capenable & IFCAP_RXCSUM)
865 				rx_desc->flags &=
866 				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
867 			break;
868 		case EFX_PKT_IPV6:
869 			if (~if_capenable & IFCAP_RXCSUM_IPV6)
870 				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
871 			break;
872 		case 0:
873 			/* Check for loopback packets */
874 			{
875 				struct ether_header *etherhp;
876 
877 				/*LINTED*/
878 				etherhp = mtod(m, struct ether_header *);
879 
880 				if (etherhp->ether_type ==
881 				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
882 					EFSYS_PROBE(loopback);
883 
884 					rxq->loopback++;
885 					goto discard;
886 				}
887 			}
888 			break;
889 		default:
890 			KASSERT(B_FALSE,
891 			    ("Rx descriptor with both IPv4 and IPv6 flags"));
892 			goto discard;
893 		}
894 
895 		/* Pass packet up the stack or into LRO (pipelined) */
896 		if (prev != NULL) {
897 			if (lro_enabled &&
898 			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
899 			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
900 				sfxge_lro(rxq, prev);
901 			else
902 				sfxge_rx_deliver(sc, prev);
903 		}
904 		prev = rx_desc;
905 		continue;
906 
907 discard:
908 		/* Return the packet to the pool */
909 		m_free(m);
910 		rx_desc->mbuf = NULL;
911 	}
912 	rxq->completed = completed;
913 
914 	level = rxq->added - rxq->completed;
915 
916 	/* Pass last packet up the stack or into LRO */
917 	if (prev != NULL) {
918 		if (lro_enabled &&
919 		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
920 		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
921 			sfxge_lro(rxq, prev);
922 		else
923 			sfxge_rx_deliver(sc, prev);
924 	}
925 
926 	/*
927 	 * If there are any pending flows and this is the end of the
928 	 * poll then they must be completed.
929 	 */
930 	if (eop)
931 		sfxge_lro_end_of_burst(rxq);
932 
933 	/* Top up the queue if necessary */
934 	if (level < rxq->refill_threshold)
935 		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
936 }
937 
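/*
 * Stop a receive queue: flush it (retrying a limited number of times),
 * complete any outstanding descriptors and release common code resources.
 */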
938 static void
939 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
940 {
941 	struct sfxge_rxq *rxq;
942 	struct sfxge_evq *evq;
943 	unsigned int count;
944 	unsigned int retry = 3;
945 
946 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
947 
948 	rxq = sc->rxq[index];
949 	evq = sc->evq[index];
950 
951 	SFXGE_EVQ_LOCK(evq);
952 
953 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
954 	    ("rxq not started"));
955 
956 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
957 
958 	callout_stop(&rxq->refill_callout);
959 
960 	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
961 		rxq->flush_state = SFXGE_FLUSH_PENDING;
962 
963 		SFXGE_EVQ_UNLOCK(evq);
964 
965 		/* Flush the receive queue */
966 		if (efx_rx_qflush(rxq->common) != 0) {
967 			SFXGE_EVQ_LOCK(evq);
968 			rxq->flush_state = SFXGE_FLUSH_FAILED;
969 			break;
970 		}
971 
972 		count = 0;
973 		do {
974 			/* Spin for 100 ms */
975 			DELAY(100000);
976 
977 			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
978 				break;
979 
980 		} while (++count < 20);
981 
982 		SFXGE_EVQ_LOCK(evq);
983 
984 		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
985 			/* Flush timeout - neither done nor failed */
986 			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
987 			    device_get_nameunit(sc->dev), index);
988 			rxq->flush_state = SFXGE_FLUSH_DONE;
989 		}
990 		retry--;
991 	}
992 	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
993 		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
994 		    device_get_nameunit(sc->dev), index);
995 		rxq->flush_state = SFXGE_FLUSH_DONE;
996 	}
997 
998 	rxq->pending = rxq->added;
999 	sfxge_rx_qcomplete(rxq, B_TRUE);
1000 
1001 	KASSERT(rxq->completed == rxq->pending,
1002 	    ("rxq->completed != rxq->pending"));
1003 
1004 	rxq->added = 0;
1005 	rxq->pushed = 0;
1006 	rxq->pending = 0;
1007 	rxq->completed = 0;
1008 	rxq->loopback = 0;
1009 
1010 	/* Destroy the common code receive queue. */
1011 	efx_rx_qdestroy(rxq->common);
1012 
1013 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1014 	    EFX_RXQ_NBUFS(sc->rxq_entries));
1015 
1016 	SFXGE_EVQ_UNLOCK(evq);
1017 }
1018 
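/*
 * Start a receive queue: program the buffer table, create and enable the
 * common code queue and fill it with receive buffers.
 */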
1019 static int
1020 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1021 {
1022 	struct sfxge_rxq *rxq;
1023 	efsys_mem_t *esmp;
1024 	struct sfxge_evq *evq;
1025 	int rc;
1026 
1027 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1028 
1029 	rxq = sc->rxq[index];
1030 	esmp = &rxq->mem;
1031 	evq = sc->evq[index];
1032 
1033 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1034 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1035 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1036 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1037 
1038 	/* Program the buffer table. */
1039 	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1040 	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1041 		return (rc);
1042 
1043 	/* Create the common code receive queue. */
1044 	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
1045 	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1046 	    &rxq->common)) != 0)
1047 		goto fail;
1048 
1049 	SFXGE_EVQ_LOCK(evq);
1050 
1051 	/* Enable the receive queue. */
1052 	efx_rx_qenable(rxq->common);
1053 
1054 	rxq->init_state = SFXGE_RXQ_STARTED;
1055 	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1056 
1057 	/* Try to fill the queue from the pool. */
1058 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1059 
1060 	SFXGE_EVQ_UNLOCK(evq);
1061 
1062 	return (0);
1063 
1064 fail:
1065 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1066 	    EFX_RXQ_NBUFS(sc->rxq_entries));
1067 	return (rc);
1068 }
1069 
1070 void
1071 sfxge_rx_stop(struct sfxge_softc *sc)
1072 {
1073 	int index;
1074 
1075 	efx_mac_filter_default_rxq_clear(sc->enp);
1076 
1077 	/* Stop the receive queue(s) */
1078 	index = sc->rxq_count;
1079 	while (--index >= 0)
1080 		sfxge_rx_qstop(sc, index);
1081 
1082 	sc->rx_prefix_size = 0;
1083 	sc->rx_buffer_size = 0;
1084 
1085 	efx_rx_fini(sc->enp);
1086 }
1087 
1088 int
1089 sfxge_rx_start(struct sfxge_softc *sc)
1090 {
1091 	struct sfxge_intr *intr;
1092 	const efx_nic_cfg_t *encp;
1093 	size_t hdrlen, align, reserved;
1094 	int index;
1095 	int rc;
1096 
1097 	intr = &sc->intr;
1098 
1099 	/* Initialize the common code receive module. */
1100 	if ((rc = efx_rx_init(sc->enp)) != 0)
1101 		return (rc);
1102 
1103 	encp = efx_nic_cfg_get(sc->enp);
1104 	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1105 
1106 	/* Calculate the receive packet buffer size. */
1107 	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1108 
1109 	/* Ensure IP headers are 32-bit aligned */
1110 	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1111 	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1112 
1113 	sc->rx_buffer_size += sc->rx_buffer_align;
1114 
1115 	/* Align end of packet buffer for RX DMA end padding */
1116 	align = MAX(1, encp->enc_rx_buf_align_end);
1117 	EFSYS_ASSERT(ISP2(align));
1118 	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1119 
1120 	/*
1121 	 * Standard mbuf zones only guarantee pointer-size alignment;
1122 	 * we need extra space to align to the cache line
1123 	 */
1124 	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1125 
1126 	/* Select zone for packet buffers */
1127 	if (reserved <= MCLBYTES)
1128 		sc->rx_buffer_zone = zone_clust;
1129 	else if (reserved <= MJUMPAGESIZE)
1130 		sc->rx_buffer_zone = zone_jumbop;
1131 	else if (reserved <= MJUM9BYTES)
1132 		sc->rx_buffer_zone = zone_jumbo9;
1133 	else
1134 		sc->rx_buffer_zone = zone_jumbo16;
1135 
1136 	/*
1137 	 * Set up the scale table.  Enable all hash types and hash insertion.
1138 	 */
1139 	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1140 		sc->rx_indir_table[index] = index % sc->rxq_count;
1141 	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1142 				       SFXGE_RX_SCALE_MAX)) != 0)
1143 		goto fail;
1144 	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1145 	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1146 	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1147 
1148 	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1149 				       sizeof(toep_key))) != 0)
1150 		goto fail;
1151 
1152 	/* Start the receive queue(s). */
1153 	for (index = 0; index < sc->rxq_count; index++) {
1154 		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1155 			goto fail2;
1156 	}
1157 
1158 	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1159 					    sc->intr.n_alloc > 1);
1160 	if (rc != 0)
1161 		goto fail3;
1162 
1163 	return (0);
1164 
1165 fail3:
1166 fail2:
1167 	while (--index >= 0)
1168 		sfxge_rx_qstop(sc, index);
1169 
1170 fail:
1171 	efx_rx_fini(sc->enp);
1172 
1173 	return (rc);
1174 }
1175 
1176 #ifdef SFXGE_LRO
1177 
1178 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1179 {
1180 	struct sfxge_lro_state *st = &rxq->lro;
1181 	unsigned i;
1182 
1183 	st->conns_mask = lro_table_size - 1;
1184 	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1185 		("lro_table_size must be a power of 2"));
1186 	st->sc = rxq->sc;
1187 	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1188 			   M_SFXGE, M_WAITOK);
1189 	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1190 			     M_SFXGE, M_WAITOK);
1191 	for (i = 0; i <= st->conns_mask; ++i) {
1192 		TAILQ_INIT(&st->conns[i]);
1193 		st->conns_n[i] = 0;
1194 	}
1195 	LIST_INIT(&st->active_conns);
1196 	TAILQ_INIT(&st->free_conns);
1197 }
1198 
1199 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1200 {
1201 	struct sfxge_lro_state *st = &rxq->lro;
1202 	struct sfxge_lro_conn *c;
1203 	unsigned i;
1204 
1205 	/* Return cleanly if sfxge_lro_init() has not been called. */
1206 	if (st->conns == NULL)
1207 		return;
1208 
1209 	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1210 
1211 	for (i = 0; i <= st->conns_mask; ++i) {
1212 		while (!TAILQ_EMPTY(&st->conns[i])) {
1213 			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1214 			sfxge_lro_drop(rxq, c);
1215 		}
1216 	}
1217 
1218 	while (!TAILQ_EMPTY(&st->free_conns)) {
1219 		c = TAILQ_FIRST(&st->free_conns);
1220 		TAILQ_REMOVE(&st->free_conns, c, link);
1221 		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1222 		free(c, M_SFXGE);
1223 	}
1224 
1225 	free(st->conns_n, M_SFXGE);
1226 	free(st->conns, M_SFXGE);
1227 	st->conns = NULL;
1228 }
1229 
1230 #else
1231 
1232 static void
1233 sfxge_lro_init(struct sfxge_rxq *rxq)
1234 {
1235 }
1236 
1237 static void
1238 sfxge_lro_fini(struct sfxge_rxq *rxq)
1239 {
1240 }
1241 
1242 #endif	/* SFXGE_LRO */
1243 
1244 static void
1245 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1246 {
1247 	struct sfxge_rxq *rxq;
1248 
1249 	rxq = sc->rxq[index];
1250 
1251 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1252 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1253 
1254 	/* Free the context array and the flow table. */
1255 	free(rxq->queue, M_SFXGE);
1256 	sfxge_lro_fini(rxq);
1257 
1258 	/* Release DMA memory. */
1259 	sfxge_dma_free(&rxq->mem);
1260 
1261 	sc->rxq[index] = NULL;
1262 
1263 	free(rxq, M_SFXGE);
1264 }
1265 
1266 static int
1267 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1268 {
1269 	struct sfxge_rxq *rxq;
1270 	struct sfxge_evq *evq;
1271 	efsys_mem_t *esmp;
1272 	int rc;
1273 
1274 	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1275 
1276 	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1277 	rxq->sc = sc;
1278 	rxq->index = index;
1279 	rxq->entries = sc->rxq_entries;
1280 	rxq->ptr_mask = rxq->entries - 1;
1281 	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1282 
1283 	sc->rxq[index] = rxq;
1284 	esmp = &rxq->mem;
1285 
1286 	evq = sc->evq[index];
1287 
1288 	/* Allocate and zero DMA space. */
1289 	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1290 		return (rc);
1291 
1292 	/* Allocate buffer table entries. */
1293 	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1294 				 &rxq->buf_base_id);
1295 
1296 	/* Allocate the context array and the flow table. */
1297 	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1298 	    M_SFXGE, M_WAITOK | M_ZERO);
1299 	sfxge_lro_init(rxq);
1300 
1301 	callout_init(&rxq->refill_callout, 1);
1302 
1303 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1304 
1305 	return (0);
1306 }
1307 
1308 static const struct {
1309 	const char *name;
1310 	size_t offset;
1311 } sfxge_rx_stats[] = {
1312 #define	SFXGE_RX_STAT(name, member) \
1313 	{ #name, offsetof(struct sfxge_rxq, member) }
1314 #ifdef SFXGE_LRO
1315 	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1316 	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1317 	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1318 	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1319 	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1320 	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1321 	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1322 	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1323 #endif
1324 };
1325 
1326 static int
1327 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1328 {
1329 	struct sfxge_softc *sc = arg1;
1330 	unsigned int id = arg2;
1331 	unsigned int sum, index;
1332 
1333 	/* Sum across all RX queues */
1334 	sum = 0;
1335 	for (index = 0; index < sc->rxq_count; index++)
1336 		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1337 					 sfxge_rx_stats[id].offset);
1338 
1339 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1340 }
1341 
1342 static void
1343 sfxge_rx_stat_init(struct sfxge_softc *sc)
1344 {
1345 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1346 	struct sysctl_oid_list *stat_list;
1347 	unsigned int id;
1348 
1349 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1350 
1351 	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1352 		SYSCTL_ADD_PROC(
1353 			ctx, stat_list,
1354 			OID_AUTO, sfxge_rx_stats[id].name,
1355 			CTLTYPE_UINT|CTLFLAG_RD,
1356 			sc, id, sfxge_rx_stat_handler, "IU",
1357 			"");
1358 	}
1359 }
1360 
1361 void
1362 sfxge_rx_fini(struct sfxge_softc *sc)
1363 {
1364 	int index;
1365 
1366 	index = sc->rxq_count;
1367 	while (--index >= 0)
1368 		sfxge_rx_qfini(sc, index);
1369 
1370 	sc->rxq_count = 0;
1371 }
1372 
1373 int
1374 sfxge_rx_init(struct sfxge_softc *sc)
1375 {
1376 	struct sfxge_intr *intr;
1377 	int index;
1378 	int rc;
1379 
1380 #ifdef SFXGE_LRO
1381 	if (!ISP2(lro_table_size)) {
1382 		log(LOG_ERR, "%s=%u must be a power of 2\n",
1383 		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1384 		rc = EINVAL;
1385 		goto fail_lro_table_size;
1386 	}
1387 
1388 	if (lro_idle_ticks == 0)
1389 		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1390 #endif
1391 
1392 	intr = &sc->intr;
1393 
1394 	sc->rxq_count = intr->n_alloc;
1395 
1396 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1397 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1398 
1399 	/* Initialize the receive queue(s) - one per interrupt. */
1400 	for (index = 0; index < sc->rxq_count; index++) {
1401 		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1402 			goto fail;
1403 	}
1404 
1405 	sfxge_rx_stat_init(sc);
1406 
1407 	return (0);
1408 
1409 fail:
1410 	/* Tear down the receive queue(s). */
1411 	while (--index >= 0)
1412 		sfxge_rx_qfini(sc, index);
1413 
1414 	sc->rxq_count = 0;
1415 
1416 #ifdef SFXGE_LRO
1417 fail_lro_table_size:
1418 #endif
1419 	return (rc);
1420 }
1421