xref: /freebsd/sys/dev/sfxge/sfxge_rx.c (revision eb9da1ada8b6b2c74378a5c17029ec5a7fb199e6)
1 /*-
2  * Copyright (c) 2010-2016 Solarflare Communications Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright notice,
12  *    this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  *    this list of conditions and the following disclaimer in the documentation
15  *    and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * The views and conclusions contained in the software and documentation are
30  * those of the authors and should not be interpreted as representing official
31  * policies, either expressed or implied, of the FreeBSD Project.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include "opt_rss.h"
38 
39 #include <sys/param.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/smp.h>
43 #include <sys/socket.h>
44 #include <sys/sysctl.h>
45 #include <sys/syslog.h>
46 #include <sys/limits.h>
48 
49 #include <net/ethernet.h>
50 #include <net/if.h>
51 #include <net/if_vlan_var.h>
52 
53 #include <netinet/in.h>
54 #include <netinet/ip.h>
55 #include <netinet/ip6.h>
56 #include <netinet/tcp.h>
57 
58 #include <machine/in_cksum.h>
59 
60 #ifdef RSS
61 #include <net/rss_config.h>
62 #endif
63 
64 #include "common/efx.h"
65 
66 
67 #include "sfxge.h"
68 #include "sfxge_rx.h"
69 
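/* Refill the RX ring once its fill level drops below 90% of the queue limit */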
70 #define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
71 
72 #ifdef SFXGE_LRO
73 
74 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
75 	    "Large receive offload (LRO) parameters");
76 
77 #define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
78 
79 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
80  * means we can accelerate a larger number of streams.
81  */
82 static unsigned lro_table_size = 128;
83 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
84 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
85 	    &lro_table_size, 0,
86 	    "Size of the LRO hash table (must be a power of 2)");
87 
88 /* Maximum length of a hash chain.  If chains get too long then the lookup
89  * time increases and may exceed the benefit of LRO.
90  */
91 static unsigned lro_chain_max = 20;
92 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
93 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
94 	    &lro_chain_max, 0,
95 	    "The maximum length of a hash chain");
96 
97 /* Maximum time (in ticks) that a connection can be idle before its LRO
98  * state is discarded.
99  */
100 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
101 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
102 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
103 	    &lro_idle_ticks, 0,
104 	    "The maximum time (in ticks) that a connection can be idle "
105 	    "before its LRO state is discarded");
106 
107 /* Number of packets with payload that must arrive in-order before a
108  * connection is eligible for LRO.  The idea is we should avoid coalescing
109  * segments when the sender is in slow-start because reducing the ACK rate
110  * can damage performance.
111  */
112 static int lro_slow_start_packets = 2000;
113 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
114 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
115 	    &lro_slow_start_packets, 0,
116 	    "Number of packets with payload that must arrive in-order before "
117 	    "a connection is eligible for LRO");
118 
119 /* Number of packets with payload that must arrive in-order following loss
120  * before a connection is eligible for LRO.  The idea is we should avoid
121  * coalescing segments when the sender is recovering from loss, because
122  * reducing the ACK rate can damage performance.
123  */
124 static int lro_loss_packets = 20;
125 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
126 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
127 	    &lro_loss_packets, 0,
128 	    "Number of packets with payload that must arrive in-order "
129 	    "following loss before a connection is eligible for LRO");
130 
131 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
132 #define	SFXGE_LRO_L2_ID_VLAN 0x4000
133 #define	SFXGE_LRO_L2_ID_IPV6 0x8000
134 #define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
135 #define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
136 
137 /* Compare IPv6 addresses, avoiding conditional branches; zero iff equal */
138 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
139 				   const struct in6_addr *right)
140 {
141 #if LONG_BIT == 64
142 	const uint64_t *left64 = (const uint64_t *)left;
143 	const uint64_t *right64 = (const uint64_t *)right;
144 	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
145 #else
146 	return (left->s6_addr32[0] - right->s6_addr32[0]) |
147 	       (left->s6_addr32[1] - right->s6_addr32[1]) |
148 	       (left->s6_addr32[2] - right->s6_addr32[2]) |
149 	       (left->s6_addr32[3] - right->s6_addr32[3]);
150 #endif
151 }
152 
153 #endif	/* SFXGE_LRO */
154 
155 void
156 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
157 {
158 
159 	rxq->flush_state = SFXGE_FLUSH_DONE;
160 }
161 
162 void
163 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
164 {
165 
166 	rxq->flush_state = SFXGE_FLUSH_FAILED;
167 }
168 
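/* Toeplitz key used for receive-side scaling: with the kernel RSS option it
 * is filled in from rss_getkey() at start-up, otherwise the fixed default
 * key below is used.
 */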
169 #ifdef RSS
170 static uint8_t toep_key[RSS_KEYSIZE];
171 #else
172 static uint8_t toep_key[] = {
173 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
174 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
175 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
176 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
177 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
178 };
179 #endif
180 
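/* Refill callout handler: post a software event to the RX queue's event
 * queue so that the refill is retried from the event-processing context.
 */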
181 static void
182 sfxge_rx_post_refill(void *arg)
183 {
184 	struct sfxge_rxq *rxq = arg;
185 	struct sfxge_softc *sc;
186 	unsigned int index;
187 	struct sfxge_evq *evq;
188 	uint16_t magic;
189 
190 	sc = rxq->sc;
191 	index = rxq->index;
192 	evq = sc->evq[index];
193 	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
194 
195 	/* This is guaranteed due to the start/stop order of rx and ev */
196 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
197 	    ("evq not started"));
198 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
199 	    ("rxq not started"));
200 	efx_ev_qpost(evq->common, magic);
201 }
202 
203 static void
204 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
205 {
206 	/* Initially retry after 100 ms, but back off in case of
207 	 * repeated failures as we probably have to wait for the
208 	 * administrator to raise the pool limit. */
209 	if (retrying)
210 		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
211 	else
212 		rxq->refill_delay = hz / 10;
213 
214 	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
215 			     sfxge_rx_post_refill, rxq);
216 }
217 
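/* Number of RX descriptors posted to the hardware per efx_rx_qpost() call */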
218 #define	SFXGE_REFILL_BATCH  64
219 
220 static void
221 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
222 {
223 	struct sfxge_softc *sc;
224 	unsigned int index;
225 	struct sfxge_evq *evq;
226 	unsigned int batch;
227 	unsigned int rxfill;
228 	unsigned int mblksize;
229 	int ntodo;
230 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
231 
232 	sc = rxq->sc;
233 	index = rxq->index;
234 	evq = sc->evq[index];
235 
236 	prefetch_read_many(sc->enp);
237 	prefetch_read_many(rxq->common);
238 
239 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
240 
241 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
242 		return;
243 
244 	rxfill = rxq->added - rxq->completed;
245 	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
246 	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
247 	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
248 	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
249 	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
250 
251 	if (ntodo == 0)
252 		return;
253 
254 	batch = 0;
255 	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
256 	while (ntodo-- > 0) {
257 		unsigned int id;
258 		struct sfxge_rx_sw_desc *rx_desc;
259 		bus_dma_segment_t seg;
260 		struct mbuf *m;
261 
262 		id = (rxq->added + batch) & rxq->ptr_mask;
263 		rx_desc = &rxq->queue[id];
264 		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
265 
266 		rx_desc->flags = EFX_DISCARD;
267 		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
268 		    sc->rx_cluster_size);
269 		if (m == NULL)
270 			break;
271 
272 		/* m_len specifies length of area to be mapped for DMA */
273 		m->m_len  = mblksize;
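		/* Start the buffer on a cache line boundary, then offset it
		 * so that the IP header following the RX prefix and Ethernet
		 * header ends up 32-bit aligned.
		 */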
274 		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
275 		m->m_data += sc->rx_buffer_align;
276 
277 		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
278 		addr[batch++] = seg.ds_addr;
279 
280 		if (batch == SFXGE_REFILL_BATCH) {
281 			efx_rx_qpost(rxq->common, addr, mblksize, batch,
282 			    rxq->completed, rxq->added);
283 			rxq->added += batch;
284 			batch = 0;
285 		}
286 	}
287 
288 	if (ntodo != 0)
289 		sfxge_rx_schedule_refill(rxq, retrying);
290 
291 	if (batch != 0) {
292 		efx_rx_qpost(rxq->common, addr, mblksize, batch,
293 		    rxq->completed, rxq->added);
294 		rxq->added += batch;
295 	}
296 
297 	/* Make the descriptors visible to the hardware */
298 	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
299 			BUS_DMASYNC_PREWRITE);
300 
301 	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
302 
303 	/* The queue could still be empty if no descriptors were actually
304 	 * pushed, in which case there will be no event to cause the next
305 	 * refill, so we must schedule a refill ourselves.
306 	 */
307 	if (rxq->pushed == rxq->completed) {
308 		sfxge_rx_schedule_refill(rxq, retrying);
309 	}
310 }
311 
312 void
313 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
314 {
315 
316 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
317 		return;
318 
319 	/* Make sure the queue is full */
320 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
321 }
322 
323 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
324 {
325 	struct ifnet *ifp = sc->ifnet;
326 
327 	m->m_pkthdr.rcvif = ifp;
328 	m->m_pkthdr.csum_data = 0xffff;
329 	ifp->if_input(ifp, m);
330 }
331 
332 static void
333 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
334 {
335 	struct mbuf *m = rx_desc->mbuf;
336 	int flags = rx_desc->flags;
337 	int csum_flags;
338 
339 	/* Convert checksum flags */
340 	csum_flags = (flags & EFX_CKSUM_IPV4) ?
341 		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
342 	if (flags & EFX_CKSUM_TCPUDP)
343 		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
344 
345 	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
346 		m->m_pkthdr.flowid =
347 			efx_psuedo_hdr_hash_get(sc->enp,
348 						EFX_RX_HASHALG_TOEPLITZ,
349 						mtod(m, uint8_t *));
350 		/* The hash covers a 4-tuple for TCP only */
351 		M_HASHTYPE_SET(m,
352 		    (flags & EFX_PKT_IPV4) ?
353 			((flags & EFX_PKT_TCP) ?
354 			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
355 			((flags & EFX_PKT_TCP) ?
356 			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
357 	}
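	/* Skip over the RX prefix prepended by the NIC */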
358 	m->m_data += sc->rx_prefix_size;
359 	m->m_len = rx_desc->size - sc->rx_prefix_size;
360 	m->m_pkthdr.len = m->m_len;
361 	m->m_pkthdr.csum_flags = csum_flags;
362 	__sfxge_rx_deliver(sc, rx_desc->mbuf);
363 
364 	rx_desc->flags = EFX_DISCARD;
365 	rx_desc->mbuf = NULL;
366 }
367 
368 #ifdef SFXGE_LRO
369 
370 static void
371 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
372 {
373 	struct sfxge_softc *sc = st->sc;
374 	struct mbuf *m = c->mbuf;
375 	struct tcphdr *c_th;
376 	int csum_flags;
377 
378 	KASSERT(m, ("no mbuf to deliver"));
379 
380 	++st->n_bursts;
381 
382 	/* Finish off packet munging and recalculate IP header checksum. */
383 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
384 		struct ip *iph = c->nh;
385 		iph->ip_len = htons(iph->ip_len);
386 		iph->ip_sum = 0;
387 		iph->ip_sum = in_cksum_hdr(iph);
388 		c_th = (struct tcphdr *)(iph + 1);
389 		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
390 			      CSUM_IP_CHECKED | CSUM_IP_VALID);
391 	} else {
392 		struct ip6_hdr *iph = c->nh;
393 		iph->ip6_plen = htons(iph->ip6_plen);
394 		c_th = (struct tcphdr *)(iph + 1);
395 		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
396 	}
397 
398 	c_th->th_win = c->th_last->th_win;
399 	c_th->th_ack = c->th_last->th_ack;
400 	if (c_th->th_off == c->th_last->th_off) {
401 		/* Copy TCP options (take care to avoid going negative). */
402 		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
403 		memcpy(c_th + 1, c->th_last + 1, optlen);
404 	}
405 
406 	m->m_pkthdr.flowid = c->conn_hash;
407 	M_HASHTYPE_SET(m,
408 	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
409 		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
410 
411 	m->m_pkthdr.csum_flags = csum_flags;
412 	__sfxge_rx_deliver(sc, m);
413 
414 	c->mbuf = NULL;
415 	c->delivered = 1;
416 }
417 
418 /* Drop the given connection, and add it to the free list. */
419 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
420 {
421 	unsigned bucket;
422 
423 	KASSERT(!c->mbuf, ("found orphaned mbuf"));
424 
425 	if (c->next_buf.mbuf != NULL) {
426 		sfxge_rx_deliver(rxq->sc, &c->next_buf);
427 		LIST_REMOVE(c, active_link);
428 	}
429 
430 	bucket = c->conn_hash & rxq->lro.conns_mask;
431 	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
432 	--rxq->lro.conns_n[bucket];
433 	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
434 	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
435 }
436 
437 /* Stop tracking connections that have gone idle in order to keep hash
438  * chains short.
439  */
440 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
441 {
442 	struct sfxge_lro_conn *c;
443 	unsigned i;
444 
445 	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
446 		("found active connections"));
447 
448 	rxq->lro.last_purge_ticks = now;
449 	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
450 		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
451 			continue;
452 
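		/* Buckets are kept most-recently-used first, so the tail
		 * entry is the least recently used connection.
		 */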
453 		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
454 		if (now - c->last_pkt_ticks > lro_idle_ticks) {
455 			++rxq->lro.n_drop_idle;
456 			sfxge_lro_drop(rxq, c);
457 		}
458 	}
459 }
460 
461 static void
462 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
463 		struct mbuf *mbuf, struct tcphdr *th)
464 {
465 	struct tcphdr *c_th;
466 
467 	/* Tack the new mbuf onto the chain. */
468 	KASSERT(!mbuf->m_next, ("mbuf already chained"));
469 	c->mbuf_tail->m_next = mbuf;
470 	c->mbuf_tail = mbuf;
471 
472 	/* Increase length appropriately */
473 	c->mbuf->m_pkthdr.len += mbuf->m_len;
474 
475 	/* Update the connection state flags */
476 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
477 		struct ip *iph = c->nh;
478 		iph->ip_len += mbuf->m_len;
479 		c_th = (struct tcphdr *)(iph + 1);
480 	} else {
481 		struct ip6_hdr *iph = c->nh;
482 		iph->ip6_plen += mbuf->m_len;
483 		c_th = (struct tcphdr *)(iph + 1);
484 	}
485 	c_th->th_flags |= (th->th_flags & TH_PUSH);
486 	c->th_last = th;
487 	++st->n_merges;
488 
489 	/* Pass packet up now if another segment could overflow the IP
490 	 * length.
491 	 */
492 	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
493 		sfxge_lro_deliver(st, c);
494 }
495 
496 static void
497 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
498 		struct mbuf *mbuf, void *nh, struct tcphdr *th)
499 {
500 	/* Start the chain */
501 	c->mbuf = mbuf;
502 	c->mbuf_tail = c->mbuf;
503 	c->nh = nh;
504 	c->th_last = th;
505 
506 	mbuf->m_pkthdr.len = mbuf->m_len;
507 
508 	/* Mangle header fields for later processing */
509 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
510 		struct ip *iph = nh;
511 		iph->ip_len = ntohs(iph->ip_len);
512 	} else {
513 		struct ip6_hdr *iph = nh;
514 		iph->ip6_plen = ntohs(iph->ip6_plen);
515 	}
516 }
517 
518 /* Try to merge or otherwise hold or deliver (as appropriate) the
519  * packet buffered for this connection (c->next_buf).  Return a flag
520  * indicating whether the connection is still active for LRO purposes.
521  */
522 static int
523 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
524 {
525 	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
526 	char *eh = c->next_eh;
527 	int data_length, hdr_length, dont_merge;
528 	unsigned th_seq, pkt_length;
529 	struct tcphdr *th;
530 	unsigned now;
531 
532 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
533 		struct ip *iph = c->next_nh;
534 		th = (struct tcphdr *)(iph + 1);
535 		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
536 	} else {
537 		struct ip6_hdr *iph = c->next_nh;
538 		th = (struct tcphdr *)(iph + 1);
539 		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
540 	}
541 
542 	hdr_length = (char *) th + th->th_off * 4 - eh;
543 	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
544 		       hdr_length);
545 	th_seq = ntohl(th->th_seq);
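	/* Never merge segments that carry no payload, or segments with
	 * URG, SYN, RST or FIN set.
	 */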
546 	dont_merge = ((data_length <= 0)
547 		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
548 
549 	/* Check for options other than aligned timestamp. */
550 	if (th->th_off != 5) {
551 		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
552 		if (th->th_off == 8 &&
553 		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
554 					(TCPOPT_NOP << 16) |
555 					(TCPOPT_TIMESTAMP << 8) |
556 					TCPOLEN_TIMESTAMP)) {
557 			/* timestamp option -- okay */
558 		} else {
559 			dont_merge = 1;
560 		}
561 	}
562 
563 	if (__predict_false(th_seq != c->next_seq)) {
564 		/* Out-of-order, so start counting again. */
565 		if (c->mbuf != NULL)
566 			sfxge_lro_deliver(&rxq->lro, c);
567 		c->n_in_order_pkts -= lro_loss_packets;
568 		c->next_seq = th_seq + data_length;
569 		++rxq->lro.n_misorder;
570 		goto deliver_buf_out;
571 	}
572 	c->next_seq = th_seq + data_length;
573 
574 	now = ticks;
575 	if (now - c->last_pkt_ticks > lro_idle_ticks) {
576 		++rxq->lro.n_drop_idle;
577 		if (c->mbuf != NULL)
578 			sfxge_lro_deliver(&rxq->lro, c);
579 		sfxge_lro_drop(rxq, c);
580 		return (0);
581 	}
582 	c->last_pkt_ticks = ticks;
583 
584 	if (c->n_in_order_pkts < lro_slow_start_packets) {
585 		/* May be in slow-start, so don't merge. */
586 		++rxq->lro.n_slow_start;
587 		++c->n_in_order_pkts;
588 		goto deliver_buf_out;
589 	}
590 
591 	if (__predict_false(dont_merge)) {
592 		if (c->mbuf != NULL)
593 			sfxge_lro_deliver(&rxq->lro, c);
594 		if (th->th_flags & (TH_FIN | TH_RST)) {
595 			++rxq->lro.n_drop_closed;
596 			sfxge_lro_drop(rxq, c);
597 			return (0);
598 		}
599 		goto deliver_buf_out;
600 	}
601 
602 	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
603 
604 	if (__predict_true(c->mbuf != NULL)) {
605 		/* Remove headers and any padding */
606 		rx_buf->mbuf->m_data += hdr_length;
607 		rx_buf->mbuf->m_len = data_length;
608 
609 		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
610 	} else {
611 		/* Remove any padding */
612 		rx_buf->mbuf->m_len = pkt_length;
613 
614 		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
615 	}
616 
617 	rx_buf->mbuf = NULL;
618 	return (1);
619 
620  deliver_buf_out:
621 	sfxge_rx_deliver(rxq->sc, rx_buf);
622 	return (1);
623 }
624 
625 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
626 			       uint16_t l2_id, void *nh, struct tcphdr *th)
627 {
628 	unsigned bucket = conn_hash & st->conns_mask;
629 	struct sfxge_lro_conn *c;
630 
631 	if (st->conns_n[bucket] >= lro_chain_max) {
632 		++st->n_too_many;
633 		return;
634 	}
635 
636 	if (!TAILQ_EMPTY(&st->free_conns)) {
637 		c = TAILQ_FIRST(&st->free_conns);
638 		TAILQ_REMOVE(&st->free_conns, c, link);
639 	} else {
640 		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
641 		if (c == NULL)
642 			return;
643 		c->mbuf = NULL;
644 		c->next_buf.mbuf = NULL;
645 	}
646 
647 	/* Create the connection tracking data */
648 	++st->conns_n[bucket];
649 	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
650 	c->l2_id = l2_id;
651 	c->conn_hash = conn_hash;
652 	c->source = th->th_sport;
653 	c->dest = th->th_dport;
654 	c->n_in_order_pkts = 0;
655 	c->last_pkt_ticks = *(volatile int *)&ticks;
656 	c->delivered = 0;
657 	++st->n_new_stream;
658 	/* NB. We don't initialise c->next_seq, and it doesn't matter what
659 	 * value it has.  Most likely the next packet received for this
660 	 * connection will not match -- no harm done.
661 	 */
662 }
663 
664 /* Process mbuf and decide whether to dispatch it to the stack now or
665  * later.
666  */
667 static void
668 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
669 {
670 	struct sfxge_softc *sc = rxq->sc;
671 	struct mbuf *m = rx_buf->mbuf;
672 	struct ether_header *eh;
673 	struct sfxge_lro_conn *c;
674 	uint16_t l2_id;
675 	uint16_t l3_proto;
676 	void *nh;
677 	struct tcphdr *th;
678 	uint32_t conn_hash;
679 	unsigned bucket;
680 
681 	/* Get the hardware hash */
682 	conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
683 					    EFX_RX_HASHALG_TOEPLITZ,
684 					    mtod(m, uint8_t *));
685 
686 	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
687 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
688 		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
689 		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
690 			SFXGE_LRO_L2_ID_VLAN;
691 		l3_proto = veh->evl_proto;
692 		nh = veh + 1;
693 	} else {
694 		l2_id = 0;
695 		l3_proto = eh->ether_type;
696 		nh = eh + 1;
697 	}
698 
699 	/* Check whether this is a suitable packet (unfragmented
700 	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
701 	 * length, and compute a hash if necessary.  If not, return.
702 	 */
703 	if (l3_proto == htons(ETHERTYPE_IP)) {
704 		struct ip *iph = nh;
705 
706 		KASSERT(iph->ip_p == IPPROTO_TCP,
707 		    ("IPv4 protocol is not TCP, but packet marker is set"));
708 		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
709 		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
710 			goto deliver_now;
711 		th = (struct tcphdr *)(iph + 1);
712 	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
713 		struct ip6_hdr *iph = nh;
714 
715 		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
716 		    ("IPv6 next header is not TCP, but packet marker is set"));
717 		l2_id |= SFXGE_LRO_L2_ID_IPV6;
718 		th = (struct tcphdr *)(iph + 1);
719 	} else {
720 		goto deliver_now;
721 	}
722 
723 	bucket = conn_hash & rxq->lro.conns_mask;
724 
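	/* Look up the connection in its hash bucket.  The subtract-and-OR
	 * expressions below are branch-free equality tests.
	 */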
725 	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
726 		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
727 			continue;
728 		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
729 			continue;
730 		if (c->mbuf != NULL) {
731 			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
732 				struct ip *c_iph, *iph = nh;
733 				c_iph = c->nh;
734 				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
735 				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
736 					continue;
737 			} else {
738 				struct ip6_hdr *c_iph, *iph = nh;
739 				c_iph = c->nh;
740 				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
741 				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
742 					continue;
743 			}
744 		}
745 
746 		/* Re-insert at head of list to reduce lookup time. */
747 		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
748 		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
749 
750 		if (c->next_buf.mbuf != NULL) {
751 			if (!sfxge_lro_try_merge(rxq, c))
752 				goto deliver_now;
753 		} else {
754 			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
755 			    active_link);
756 		}
757 		c->next_buf = *rx_buf;
758 		c->next_eh = eh;
759 		c->next_nh = nh;
760 
761 		rx_buf->mbuf = NULL;
762 		rx_buf->flags = EFX_DISCARD;
763 		return;
764 	}
765 
766 	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
767  deliver_now:
768 	sfxge_rx_deliver(sc, rx_buf);
769 }
770 
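/* Called at the end of an event-queue poll: merge or deliver any packets
 * still held on active connections, then periodically purge idle connections.
 */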
771 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
772 {
773 	struct sfxge_lro_state *st = &rxq->lro;
774 	struct sfxge_lro_conn *c;
775 	unsigned t;
776 
777 	while (!LIST_EMPTY(&st->active_conns)) {
778 		c = LIST_FIRST(&st->active_conns);
779 		if (!c->delivered && c->mbuf != NULL)
780 			sfxge_lro_deliver(st, c);
781 		if (sfxge_lro_try_merge(rxq, c)) {
782 			if (c->mbuf != NULL)
783 				sfxge_lro_deliver(st, c);
784 			LIST_REMOVE(c, active_link);
785 		}
786 		c->delivered = 0;
787 	}
788 
789 	t = *(volatile int *)&ticks;
790 	if (__predict_false(t != st->last_purge_ticks))
791 		sfxge_lro_purge_idle(rxq, t);
792 }
793 
794 #else	/* !SFXGE_LRO */
795 
796 static void
797 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
798 {
799 }
800 
801 static void
802 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
803 {
804 }
805 
806 #endif	/* SFXGE_LRO */
807 
808 void
809 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
810 {
811 	struct sfxge_softc *sc = rxq->sc;
812 	int if_capenable = sc->ifnet->if_capenable;
813 	int lro_enabled = if_capenable & IFCAP_LRO;
814 	unsigned int index;
815 	struct sfxge_evq *evq;
816 	unsigned int completed;
817 	unsigned int level;
818 	struct mbuf *m;
819 	struct sfxge_rx_sw_desc *prev = NULL;
820 
821 	index = rxq->index;
822 	evq = sc->evq[index];
823 
824 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
825 
826 	completed = rxq->completed;
827 	while (completed != rxq->pending) {
828 		unsigned int id;
829 		struct sfxge_rx_sw_desc *rx_desc;
830 
831 		id = completed++ & rxq->ptr_mask;
832 		rx_desc = &rxq->queue[id];
833 		m = rx_desc->mbuf;
834 
835 		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
836 			goto discard;
837 
838 		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
839 			goto discard;
840 
841 		/* Read the length from the pseudo header if required */
842 		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
843 			uint16_t tmp_size;
844 			int rc;
845 			rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
846 							   mtod(m, uint8_t *),
847 							   &tmp_size);
848 			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
849 			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
850 		}
851 
852 		prefetch_read_many(mtod(m, caddr_t));
853 
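		/* Honour the interface's checksum-offload settings and
		 * discard packets carrying the driver's loopback Ethertype.
		 */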
854 		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
855 		case EFX_PKT_IPV4:
856 			if (~if_capenable & IFCAP_RXCSUM)
857 				rx_desc->flags &=
858 				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
859 			break;
860 		case EFX_PKT_IPV6:
861 			if (~if_capenable & IFCAP_RXCSUM_IPV6)
862 				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
863 			break;
864 		case 0:
865 			/* Check for loopback packets */
866 			{
867 				struct ether_header *etherhp;
868 
869 				/*LINTED*/
870 				etherhp = mtod(m, struct ether_header *);
871 
872 				if (etherhp->ether_type ==
873 				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
874 					EFSYS_PROBE(loopback);
875 
876 					rxq->loopback++;
877 					goto discard;
878 				}
879 			}
880 			break;
881 		default:
882 			KASSERT(B_FALSE,
883 			    ("Rx descriptor with both IPv4 and IPv6 flags"));
884 			goto discard;
885 		}
886 
887 		/* Pass packet up the stack or into LRO (pipelined) */
888 		if (prev != NULL) {
889 			if (lro_enabled &&
890 			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
891 			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
892 				sfxge_lro(rxq, prev);
893 			else
894 				sfxge_rx_deliver(sc, prev);
895 		}
896 		prev = rx_desc;
897 		continue;
898 
899 discard:
900 		/* Return the packet to the pool */
901 		m_free(m);
902 		rx_desc->mbuf = NULL;
903 	}
904 	rxq->completed = completed;
905 
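	/* Current fill level: descriptors posted but not yet completed */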
906 	level = rxq->added - rxq->completed;
907 
908 	/* Pass last packet up the stack or into LRO */
909 	if (prev != NULL) {
910 		if (lro_enabled &&
911 		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
912 		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
913 			sfxge_lro(rxq, prev);
914 		else
915 			sfxge_rx_deliver(sc, prev);
916 	}
917 
918 	/*
919 	 * If there are any pending flows and this is the end of the
920 	 * poll then they must be completed.
921 	 */
922 	if (eop)
923 		sfxge_lro_end_of_burst(rxq);
924 
925 	/* Top up the queue if necessary */
926 	if (level < rxq->refill_threshold)
927 		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
928 }
929 
930 static void
931 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
932 {
933 	struct sfxge_rxq *rxq;
934 	struct sfxge_evq *evq;
935 	unsigned int count;
936 	unsigned int retry = 3;
937 
938 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
939 
940 	rxq = sc->rxq[index];
941 	evq = sc->evq[index];
942 
943 	SFXGE_EVQ_LOCK(evq);
944 
945 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
946 	    ("rxq not started"));
947 
948 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
949 
950 	callout_stop(&rxq->refill_callout);
951 
952 	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
953 		rxq->flush_state = SFXGE_FLUSH_PENDING;
954 
955 		SFXGE_EVQ_UNLOCK(evq);
956 
957 		/* Flush the receive queue */
958 		if (efx_rx_qflush(rxq->common) != 0) {
959 			SFXGE_EVQ_LOCK(evq);
960 			rxq->flush_state = SFXGE_FLUSH_FAILED;
961 			break;
962 		}
963 
964 		count = 0;
965 		do {
966 			/* Spin for 100 ms */
967 			DELAY(100000);
968 
969 			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
970 				break;
971 
972 		} while (++count < 20);
973 
974 		SFXGE_EVQ_LOCK(evq);
975 
976 		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
977 			/* Flush timeout - neither done nor failed */
978 			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
979 			    device_get_nameunit(sc->dev), index);
980 			rxq->flush_state = SFXGE_FLUSH_DONE;
981 		}
982 		retry--;
983 	}
984 	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
985 		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
986 		    device_get_nameunit(sc->dev), index);
987 		rxq->flush_state = SFXGE_FLUSH_DONE;
988 	}
989 
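	/* Complete all outstanding descriptors so that their mbufs are
	 * freed before the queue state is reset.
	 */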
990 	rxq->pending = rxq->added;
991 	sfxge_rx_qcomplete(rxq, B_TRUE);
992 
993 	KASSERT(rxq->completed == rxq->pending,
994 	    ("rxq->completed != rxq->pending"));
995 
996 	rxq->added = 0;
997 	rxq->pushed = 0;
998 	rxq->pending = 0;
999 	rxq->completed = 0;
1000 	rxq->loopback = 0;
1001 
1002 	/* Destroy the common code receive queue. */
1003 	efx_rx_qdestroy(rxq->common);
1004 
1005 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1006 	    EFX_RXQ_NBUFS(sc->rxq_entries));
1007 
1008 	SFXGE_EVQ_UNLOCK(evq);
1009 }
1010 
1011 static int
1012 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1013 {
1014 	struct sfxge_rxq *rxq;
1015 	efsys_mem_t *esmp;
1016 	struct sfxge_evq *evq;
1017 	int rc;
1018 
1019 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1020 
1021 	rxq = sc->rxq[index];
1022 	esmp = &rxq->mem;
1023 	evq = sc->evq[index];
1024 
1025 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1026 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1027 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1028 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1029 
1030 	/* Program the buffer table. */
1031 	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1032 	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1033 		return (rc);
1034 
1035 	/* Create the common code receive queue. */
1036 	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1037 	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1038 	    &rxq->common)) != 0)
1039 		goto fail;
1040 
1041 	SFXGE_EVQ_LOCK(evq);
1042 
1043 	/* Enable the receive queue. */
1044 	efx_rx_qenable(rxq->common);
1045 
1046 	rxq->init_state = SFXGE_RXQ_STARTED;
1047 	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1048 
1049 	/* Try to fill the queue from the pool. */
1050 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1051 
1052 	SFXGE_EVQ_UNLOCK(evq);
1053 
1054 	return (0);
1055 
1056 fail:
1057 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1058 	    EFX_RXQ_NBUFS(sc->rxq_entries));
1059 	return (rc);
1060 }
1061 
1062 void
1063 sfxge_rx_stop(struct sfxge_softc *sc)
1064 {
1065 	int index;
1066 
1067 	efx_mac_filter_default_rxq_clear(sc->enp);
1068 
1069 	/* Stop the receive queue(s) */
1070 	index = sc->rxq_count;
1071 	while (--index >= 0)
1072 		sfxge_rx_qstop(sc, index);
1073 
1074 	sc->rx_prefix_size = 0;
1075 	sc->rx_buffer_size = 0;
1076 
1077 	efx_rx_fini(sc->enp);
1078 }
1079 
1080 int
1081 sfxge_rx_start(struct sfxge_softc *sc)
1082 {
1083 	struct sfxge_intr *intr;
1084 	const efx_nic_cfg_t *encp;
1085 	size_t hdrlen, align, reserved;
1086 	int index;
1087 	int rc;
1088 
1089 	intr = &sc->intr;
1090 
1091 	/* Initialize the common code receive module. */
1092 	if ((rc = efx_rx_init(sc->enp)) != 0)
1093 		return (rc);
1094 
1095 	encp = efx_nic_cfg_get(sc->enp);
1096 	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1097 
1098 	/* Calculate the receive packet buffer size. */
1099 	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1100 
1101 	/* Ensure IP headers are 32bit aligned */
1102 	/* Ensure IP headers are 32-bit aligned */
1103 	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1104 
1105 	sc->rx_buffer_size += sc->rx_buffer_align;
1106 
1107 	/* Align end of packet buffer for RX DMA end padding */
1108 	align = MAX(1, encp->enc_rx_buf_align_end);
1109 	EFSYS_ASSERT(ISP2(align));
1110 	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1111 
1112 	/*
1113 	 * Standard mbuf zones only guarantee pointer-size alignment;
1114 	 * we need extra space to align to the cache line
1115 	 */
1116 	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1117 
1118 	/* Select zone for packet buffers */
1119 	if (reserved <= MCLBYTES)
1120 		sc->rx_cluster_size = MCLBYTES;
1121 	else if (reserved <= MJUMPAGESIZE)
1122 		sc->rx_cluster_size = MJUMPAGESIZE;
1123 	else if (reserved <= MJUM9BYTES)
1124 		sc->rx_cluster_size = MJUM9BYTES;
1125 	else
1126 		sc->rx_cluster_size = MJUM16BYTES;
1127 
1128 	/*
1129 	 * Set up the scale table.  Enable all hash types and hash insertion.
1130 	 */
1131 	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1132 #ifdef RSS
1133 		sc->rx_indir_table[index] =
1134 			rss_get_indirection_to_bucket(index) % sc->rxq_count;
1135 #else
1136 		sc->rx_indir_table[index] = index % sc->rxq_count;
1137 #endif
1138 	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1139 				       SFXGE_RX_SCALE_MAX)) != 0)
1140 		goto fail;
1141 	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1142 	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1143 	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1144 
1145 #ifdef RSS
1146 	rss_getkey(toep_key);
1147 #endif
1148 	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1149 				       sizeof(toep_key))) != 0)
1150 		goto fail;
1151 
1152 	/* Start the receive queue(s). */
1153 	for (index = 0; index < sc->rxq_count; index++) {
1154 		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1155 			goto fail2;
1156 	}
1157 
1158 	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1159 					    sc->intr.n_alloc > 1);
1160 	if (rc != 0)
1161 		goto fail3;
1162 
1163 	return (0);
1164 
1165 fail3:
1166 fail2:
1167 	while (--index >= 0)
1168 		sfxge_rx_qstop(sc, index);
1169 
1170 fail:
1171 	efx_rx_fini(sc->enp);
1172 
1173 	return (rc);
1174 }
1175 
1176 #ifdef SFXGE_LRO
1177 
1178 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1179 {
1180 	struct sfxge_lro_state *st = &rxq->lro;
1181 	unsigned i;
1182 
1183 	st->conns_mask = lro_table_size - 1;
1184 	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1185 		("lro_table_size must be a power of 2"));
1186 	st->sc = rxq->sc;
1187 	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1188 			   M_SFXGE, M_WAITOK);
1189 	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1190 			     M_SFXGE, M_WAITOK);
1191 	for (i = 0; i <= st->conns_mask; ++i) {
1192 		TAILQ_INIT(&st->conns[i]);
1193 		st->conns_n[i] = 0;
1194 	}
1195 	LIST_INIT(&st->active_conns);
1196 	TAILQ_INIT(&st->free_conns);
1197 }
1198 
1199 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1200 {
1201 	struct sfxge_lro_state *st = &rxq->lro;
1202 	struct sfxge_lro_conn *c;
1203 	unsigned i;
1204 
1205 	/* Return cleanly if sfxge_lro_init() has not been called. */
1206 	if (st->conns == NULL)
1207 		return;
1208 
1209 	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1210 
1211 	for (i = 0; i <= st->conns_mask; ++i) {
1212 		while (!TAILQ_EMPTY(&st->conns[i])) {
1213 			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1214 			sfxge_lro_drop(rxq, c);
1215 		}
1216 	}
1217 
1218 	while (!TAILQ_EMPTY(&st->free_conns)) {
1219 		c = TAILQ_FIRST(&st->free_conns);
1220 		TAILQ_REMOVE(&st->free_conns, c, link);
1221 		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1222 		free(c, M_SFXGE);
1223 	}
1224 
1225 	free(st->conns_n, M_SFXGE);
1226 	free(st->conns, M_SFXGE);
1227 	st->conns = NULL;
1228 }
1229 
1230 #else
1231 
1232 static void
1233 sfxge_lro_init(struct sfxge_rxq *rxq)
1234 {
1235 }
1236 
1237 static void
1238 sfxge_lro_fini(struct sfxge_rxq *rxq)
1239 {
1240 }
1241 
1242 #endif	/* SFXGE_LRO */
1243 
1244 static void
1245 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1246 {
1247 	struct sfxge_rxq *rxq;
1248 
1249 	rxq = sc->rxq[index];
1250 
1251 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1252 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1253 
1254 	/* Free the context array and the flow table. */
1255 	free(rxq->queue, M_SFXGE);
1256 	sfxge_lro_fini(rxq);
1257 
1258 	/* Release DMA memory. */
1259 	sfxge_dma_free(&rxq->mem);
1260 
1261 	sc->rxq[index] = NULL;
1262 
1263 	free(rxq, M_SFXGE);
1264 }
1265 
1266 static int
1267 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1268 {
1269 	struct sfxge_rxq *rxq;
1270 	struct sfxge_evq *evq;
1271 	efsys_mem_t *esmp;
1272 	int rc;
1273 
1274 	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1275 
1276 	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1277 	rxq->sc = sc;
1278 	rxq->index = index;
1279 	rxq->entries = sc->rxq_entries;
1280 	rxq->ptr_mask = rxq->entries - 1;
1281 	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1282 
1283 	sc->rxq[index] = rxq;
1284 	esmp = &rxq->mem;
1285 
1286 	evq = sc->evq[index];
1287 
1288 	/* Allocate and zero DMA space. */
1289 	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1290 		return (rc);
1291 
1292 	/* Allocate buffer table entries. */
1293 	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1294 				 &rxq->buf_base_id);
1295 
1296 	/* Allocate the context array and the flow table. */
1297 	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1298 	    M_SFXGE, M_WAITOK | M_ZERO);
1299 	sfxge_lro_init(rxq);
1300 
1301 	callout_init(&rxq->refill_callout, 1);
1302 
1303 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1304 
1305 	return (0);
1306 }
1307 
1308 static const struct {
1309 	const char *name;
1310 	size_t offset;
1311 } sfxge_rx_stats[] = {
1312 #define	SFXGE_RX_STAT(name, member) \
1313 	{ #name, offsetof(struct sfxge_rxq, member) }
1314 #ifdef SFXGE_LRO
1315 	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1316 	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1317 	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1318 	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1319 	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1320 	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1321 	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1322 	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1323 #endif
1324 };
1325 
1326 static int
1327 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1328 {
1329 	struct sfxge_softc *sc = arg1;
1330 	unsigned int id = arg2;
1331 	unsigned int sum, index;
1332 
1333 	/* Sum across all RX queues */
1334 	sum = 0;
1335 	for (index = 0; index < sc->rxq_count; index++)
1336 		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1337 					 sfxge_rx_stats[id].offset);
1338 
1339 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1340 }
1341 
1342 static void
1343 sfxge_rx_stat_init(struct sfxge_softc *sc)
1344 {
1345 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1346 	struct sysctl_oid_list *stat_list;
1347 	unsigned int id;
1348 
1349 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1350 
1351 	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1352 		SYSCTL_ADD_PROC(
1353 			ctx, stat_list,
1354 			OID_AUTO, sfxge_rx_stats[id].name,
1355 			CTLTYPE_UINT | CTLFLAG_RD,
1356 			sc, id, sfxge_rx_stat_handler, "IU",
1357 			"");
1358 	}
1359 }
1360 
1361 void
1362 sfxge_rx_fini(struct sfxge_softc *sc)
1363 {
1364 	int index;
1365 
1366 	index = sc->rxq_count;
1367 	while (--index >= 0)
1368 		sfxge_rx_qfini(sc, index);
1369 
1370 	sc->rxq_count = 0;
1371 }
1372 
1373 int
1374 sfxge_rx_init(struct sfxge_softc *sc)
1375 {
1376 	struct sfxge_intr *intr;
1377 	int index;
1378 	int rc;
1379 
1380 #ifdef SFXGE_LRO
1381 	if (!ISP2(lro_table_size)) {
1382 		log(LOG_ERR, "%s=%u must be power of 2",
1383 		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1384 		rc = EINVAL;
1385 		goto fail_lro_table_size;
1386 	}
1387 
1388 	if (lro_idle_ticks == 0)
1389 		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1390 #endif
1391 
1392 	intr = &sc->intr;
1393 
1394 	sc->rxq_count = intr->n_alloc;
1395 
1396 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1397 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1398 
1399 	/* Initialize the receive queue(s) - one per interrupt. */
1400 	for (index = 0; index < sc->rxq_count; index++) {
1401 		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1402 			goto fail;
1403 	}
1404 
1405 	sfxge_rx_stat_init(sc);
1406 
1407 	return (0);
1408 
1409 fail:
1410 	/* Tear down the receive queue(s). */
1411 	while (--index >= 0)
1412 		sfxge_rx_qfini(sc, index);
1413 
1414 	sc->rxq_count = 0;
1415 
1416 #ifdef SFXGE_LRO
1417 fail_lro_table_size:
1418 #endif
1419 	return (rc);
1420 }
1421