xref: /freebsd/sys/dev/sfxge/sfxge_rx.c (revision e27abb6689c5733dd08ce240d5402a0de3a42254)
1 /*-
2  * Copyright (c) 2010-2016 Solarflare Communications Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright notice,
12  *    this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  *    this list of conditions and the following disclaimer in the documentation
15  *    and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * The views and conclusions contained in the software and documentation are
30  * those of the authors and should not be interpreted as representing official
31  * policies, either expressed or implied, of the FreeBSD Project.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/param.h>
38 #include <sys/malloc.h>
39 #include <sys/mbuf.h>
40 #include <sys/smp.h>
41 #include <sys/socket.h>
42 #include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>
46 
47 #include <net/ethernet.h>
48 #include <net/if.h>
49 #include <net/if_vlan_var.h>
50 
51 #include <netinet/in.h>
52 #include <netinet/ip.h>
53 #include <netinet/ip6.h>
54 #include <netinet/tcp.h>
55 
56 #include <machine/in_cksum.h>
57 
58 #include "common/efx.h"
59 
61 #include "sfxge.h"
62 #include "sfxge_rx.h"
63 
64 #define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
65 
66 #ifdef SFXGE_LRO
67 
68 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
69 	    "Large receive offload (LRO) parameters");
70 
71 #define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
72 
73 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
74  * means we can accelerate a larger number of streams.
75  */
76 static unsigned lro_table_size = 128;
77 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
78 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
79 	    &lro_table_size, 0,
80 	    "Size of the LRO hash table (must be a power of 2)");
81 
82 /* Maximum length of a hash chain.  If chains get too long then the lookup
83  * time increases and may exceed the benefit of LRO.
84  */
85 static unsigned lro_chain_max = 20;
86 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
87 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
88 	    &lro_chain_max, 0,
89 	    "The maximum length of a hash chain");
90 
/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
94 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
95 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
96 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
97 	    &lro_idle_ticks, 0,
98 	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");
100 
101 /* Number of packets with payload that must arrive in-order before a
102  * connection is eligible for LRO.  The idea is we should avoid coalescing
103  * segments when the sender is in slow-start because reducing the ACK rate
104  * can damage performance.
105  */
106 static int lro_slow_start_packets = 2000;
107 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
108 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
109 	    &lro_slow_start_packets, 0,
110 	    "Number of packets with payload that must arrive in-order before "
111 	    "a connection is eligible for LRO");
112 
113 /* Number of packets with payload that must arrive in-order following loss
114  * before a connection is eligible for LRO.  The idea is we should avoid
115  * coalescing segments when the sender is recovering from loss, because
116  * reducing the ACK rate can damage performance.
117  */
118 static int lro_loss_packets = 20;
119 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
120 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
121 	    &lro_loss_packets, 0,
122 	    "Number of packets with payload that must arrive in-order "
123 	    "following loss before a connection is eligible for LRO");
124 
125 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
126 #define	SFXGE_LRO_L2_ID_VLAN 0x4000
127 #define	SFXGE_LRO_L2_ID_IPV6 0x8000
128 #define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
129 #define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
130 
131 /* Compare IPv6 addresses, avoiding conditional branches */
132 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
133 				   const struct in6_addr *right)
134 {
135 #if LONG_BIT == 64
136 	const uint64_t *left64 = (const uint64_t *)left;
137 	const uint64_t *right64 = (const uint64_t *)right;
138 	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
139 #else
140 	return (left->s6_addr32[0] - right->s6_addr32[0]) |
141 	       (left->s6_addr32[1] - right->s6_addr32[1]) |
142 	       (left->s6_addr32[2] - right->s6_addr32[2]) |
143 	       (left->s6_addr32[3] - right->s6_addr32[3]);
144 #endif
145 }
146 
147 #endif	/* SFXGE_LRO */
148 
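/*
 * Record the outcome of an Rx queue flush; sfxge_rx_qstop() polls
 * rxq->flush_state waiting for one of these transitions.
 */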
149 void
150 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
151 {
152 
153 	rxq->flush_state = SFXGE_FLUSH_DONE;
154 }
155 
156 void
157 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
158 {
159 
160 	rxq->flush_state = SFXGE_FLUSH_FAILED;
161 }
162 
163 static uint8_t toep_key[] = {
164 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
165 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
166 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
167 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
168 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
169 };
170 
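/*
 * Callout handler: post a software event to the paired event queue so that
 * the deferred refill is retried from the event processing context.
 */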
171 static void
172 sfxge_rx_post_refill(void *arg)
173 {
174 	struct sfxge_rxq *rxq = arg;
175 	struct sfxge_softc *sc;
176 	unsigned int index;
177 	struct sfxge_evq *evq;
178 	uint16_t magic;
179 
180 	sc = rxq->sc;
181 	index = rxq->index;
182 	evq = sc->evq[index];
183 	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
184 
185 	/* This is guaranteed due to the start/stop order of rx and ev */
186 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
187 	    ("evq not started"));
188 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
189 	    ("rxq not started"));
190 	efx_ev_qpost(evq->common, magic);
191 }
192 
193 static void
194 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
195 {
196 	/* Initially retry after 100 ms, but back off in case of
197 	 * repeated failures as we probably have to wait for the
198 	 * administrator to raise the pool limit. */
199 	if (retrying)
200 		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
201 	else
202 		rxq->refill_delay = hz / 10;
203 
204 	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
205 			     sfxge_rx_post_refill, rxq);
206 }
207 
208 #define	SFXGE_REFILL_BATCH  64
209 
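/*
 * Fill the Rx ring with up to 'target' new mbuf cluster buffers, posting
 * their DMA addresses to the hardware in batches of SFXGE_REFILL_BATCH.
 * Called with the event queue lock held; schedules a retry via the refill
 * callout if buffer allocation fails.
 */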
210 static void
211 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
212 {
213 	struct sfxge_softc *sc;
214 	unsigned int index;
215 	struct sfxge_evq *evq;
216 	unsigned int batch;
217 	unsigned int rxfill;
218 	unsigned int mblksize;
219 	int ntodo;
220 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
221 
222 	sc = rxq->sc;
223 	index = rxq->index;
224 	evq = sc->evq[index];
225 
226 	prefetch_read_many(sc->enp);
227 	prefetch_read_many(rxq->common);
228 
229 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
230 
231 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
232 		return;
233 
234 	rxfill = rxq->added - rxq->completed;
235 	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
236 	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
237 	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
238 	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
240 
241 	if (ntodo == 0)
242 		return;
243 
244 	batch = 0;
245 	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
246 	while (ntodo-- > 0) {
247 		unsigned int id;
248 		struct sfxge_rx_sw_desc *rx_desc;
249 		bus_dma_segment_t seg;
250 		struct mbuf *m;
251 
252 		id = (rxq->added + batch) & rxq->ptr_mask;
253 		rx_desc = &rxq->queue[id];
254 		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
255 
256 		rx_desc->flags = EFX_DISCARD;
257 		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
258 		    sc->rx_cluster_size);
259 		if (m == NULL)
260 			break;
261 
262 		/* m_len specifies length of area to be mapped for DMA */
263 		m->m_len  = mblksize;
264 		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
265 		m->m_data += sc->rx_buffer_align;
266 
267 		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
268 		addr[batch++] = seg.ds_addr;
269 
270 		if (batch == SFXGE_REFILL_BATCH) {
271 			efx_rx_qpost(rxq->common, addr, mblksize, batch,
272 			    rxq->completed, rxq->added);
273 			rxq->added += batch;
274 			batch = 0;
275 		}
276 	}
277 
278 	if (ntodo != 0)
279 		sfxge_rx_schedule_refill(rxq, retrying);
280 
281 	if (batch != 0) {
282 		efx_rx_qpost(rxq->common, addr, mblksize, batch,
283 		    rxq->completed, rxq->added);
284 		rxq->added += batch;
285 	}
286 
287 	/* Make the descriptors visible to the hardware */
288 	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
289 			BUS_DMASYNC_PREWRITE);
290 
291 	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
292 
293 	/* The queue could still be empty if no descriptors were actually
294 	 * pushed, in which case there will be no event to cause the next
295 	 * refill, so we must schedule a refill ourselves.
296 	 */
	if (rxq->pushed == rxq->completed) {
298 		sfxge_rx_schedule_refill(rxq, retrying);
299 	}
300 }
301 
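/* Top up the Rx ring, typically in response to a refill software event. */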
302 void
303 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
304 {
305 
306 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
307 		return;
308 
309 	/* Make sure the queue is full */
310 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
311 }
312 
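/* Hand a fully formed packet to the network stack via if_input(). */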
313 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
314 {
315 	struct ifnet *ifp = sc->ifnet;
316 
317 	m->m_pkthdr.rcvif = ifp;
318 	m->m_pkthdr.csum_data = 0xffff;
319 	ifp->if_input(ifp, m);
320 }
321 
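/*
 * Deliver a single received packet: translate hardware checksum and RSS
 * flags into mbuf metadata, strip the Rx prefix and pass it up the stack.
 */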
322 static void
323 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
324 {
325 	struct mbuf *m = rx_desc->mbuf;
326 	int flags = rx_desc->flags;
327 	int csum_flags;
328 
329 	/* Convert checksum flags */
330 	csum_flags = (flags & EFX_CKSUM_IPV4) ?
331 		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
332 	if (flags & EFX_CKSUM_TCPUDP)
333 		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
334 
335 	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
336 		m->m_pkthdr.flowid =
337 			efx_psuedo_hdr_hash_get(sc->enp,
338 						EFX_RX_HASHALG_TOEPLITZ,
339 						mtod(m, uint8_t *));
340 		/* The hash covers a 4-tuple for TCP only */
341 		M_HASHTYPE_SET(m,
342 		    (flags & EFX_PKT_IPV4) ?
343 			((flags & EFX_PKT_TCP) ?
344 			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
345 			((flags & EFX_PKT_TCP) ?
346 			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
347 	}
348 	m->m_data += sc->rx_prefix_size;
349 	m->m_len = rx_desc->size - sc->rx_prefix_size;
350 	m->m_pkthdr.len = m->m_len;
351 	m->m_pkthdr.csum_flags = csum_flags;
352 	__sfxge_rx_deliver(sc, rx_desc->mbuf);
353 
354 	rx_desc->flags = EFX_DISCARD;
355 	rx_desc->mbuf = NULL;
356 }
357 
358 #ifdef SFXGE_LRO
359 
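/*
 * Deliver a coalesced packet chain: restore the IP length fields mangled
 * during merging, recompute the IPv4 header checksum and copy the latest
 * TCP window/ACK (and options) before passing the chain to the stack.
 */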
360 static void
361 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
362 {
363 	struct sfxge_softc *sc = st->sc;
364 	struct mbuf *m = c->mbuf;
365 	struct tcphdr *c_th;
366 	int csum_flags;
367 
368 	KASSERT(m, ("no mbuf to deliver"));
369 
370 	++st->n_bursts;
371 
372 	/* Finish off packet munging and recalculate IP header checksum. */
373 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
374 		struct ip *iph = c->nh;
375 		iph->ip_len = htons(iph->ip_len);
376 		iph->ip_sum = 0;
377 		iph->ip_sum = in_cksum_hdr(iph);
378 		c_th = (struct tcphdr *)(iph + 1);
379 		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
380 			      CSUM_IP_CHECKED | CSUM_IP_VALID);
381 	} else {
382 		struct ip6_hdr *iph = c->nh;
383 		iph->ip6_plen = htons(iph->ip6_plen);
384 		c_th = (struct tcphdr *)(iph + 1);
385 		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
386 	}
387 
388 	c_th->th_win = c->th_last->th_win;
389 	c_th->th_ack = c->th_last->th_ack;
390 	if (c_th->th_off == c->th_last->th_off) {
391 		/* Copy TCP options (take care to avoid going negative). */
392 		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
393 		memcpy(c_th + 1, c->th_last + 1, optlen);
394 	}
395 
396 	m->m_pkthdr.flowid = c->conn_hash;
397 	M_HASHTYPE_SET(m,
398 	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
399 		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
400 
401 	m->m_pkthdr.csum_flags = csum_flags;
402 	__sfxge_rx_deliver(sc, m);
403 
404 	c->mbuf = NULL;
405 	c->delivered = 1;
406 }
407 
408 /* Drop the given connection, and add it to the free list. */
409 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
410 {
411 	unsigned bucket;
412 
413 	KASSERT(!c->mbuf, ("found orphaned mbuf"));
414 
415 	if (c->next_buf.mbuf != NULL) {
416 		sfxge_rx_deliver(rxq->sc, &c->next_buf);
417 		LIST_REMOVE(c, active_link);
418 	}
419 
420 	bucket = c->conn_hash & rxq->lro.conns_mask;
421 	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
422 	--rxq->lro.conns_n[bucket];
423 	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
424 	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
425 }
426 
427 /* Stop tracking connections that have gone idle in order to keep hash
428  * chains short.
429  */
430 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
431 {
432 	struct sfxge_lro_conn *c;
433 	unsigned i;
434 
435 	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
436 		("found active connections"));
437 
438 	rxq->lro.last_purge_ticks = now;
439 	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
440 		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
441 			continue;
442 
443 		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
444 		if (now - c->last_pkt_ticks > lro_idle_ticks) {
445 			++rxq->lro.n_drop_idle;
446 			sfxge_lro_drop(rxq, c);
447 		}
448 	}
449 }
450 
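/*
 * Append a new in-order segment to the connection's coalesced packet and
 * fold its TCP state (PUSH flag, latest header) into the chain.
 */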
451 static void
452 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
453 		struct mbuf *mbuf, struct tcphdr *th)
454 {
455 	struct tcphdr *c_th;
456 
457 	/* Tack the new mbuf onto the chain. */
458 	KASSERT(!mbuf->m_next, ("mbuf already chained"));
459 	c->mbuf_tail->m_next = mbuf;
460 	c->mbuf_tail = mbuf;
461 
462 	/* Increase length appropriately */
463 	c->mbuf->m_pkthdr.len += mbuf->m_len;
464 
465 	/* Update the connection state flags */
466 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
467 		struct ip *iph = c->nh;
468 		iph->ip_len += mbuf->m_len;
469 		c_th = (struct tcphdr *)(iph + 1);
470 	} else {
471 		struct ip6_hdr *iph = c->nh;
472 		iph->ip6_plen += mbuf->m_len;
473 		c_th = (struct tcphdr *)(iph + 1);
474 	}
475 	c_th->th_flags |= (th->th_flags & TH_PUSH);
476 	c->th_last = th;
477 	++st->n_merges;
478 
479 	/* Pass packet up now if another segment could overflow the IP
480 	 * length.
481 	 */
482 	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
483 		sfxge_lro_deliver(st, c);
484 }
485 
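/*
 * Start a new coalesced packet for this connection.  The IP length field is
 * converted to host order here so that merges can accumulate into it; it is
 * converted back in sfxge_lro_deliver().
 */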
486 static void
487 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
488 		struct mbuf *mbuf, void *nh, struct tcphdr *th)
489 {
490 	/* Start the chain */
491 	c->mbuf = mbuf;
492 	c->mbuf_tail = c->mbuf;
493 	c->nh = nh;
494 	c->th_last = th;
495 
496 	mbuf->m_pkthdr.len = mbuf->m_len;
497 
498 	/* Mangle header fields for later processing */
499 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
500 		struct ip *iph = nh;
501 		iph->ip_len = ntohs(iph->ip_len);
502 	} else {
503 		struct ip6_hdr *iph = nh;
504 		iph->ip6_plen = ntohs(iph->ip6_plen);
505 	}
506 }
507 
508 /* Try to merge or otherwise hold or deliver (as appropriate) the
509  * packet buffered for this connection (c->next_buf).  Return a flag
510  * indicating whether the connection is still active for LRO purposes.
511  */
512 static int
513 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
514 {
515 	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
516 	char *eh = c->next_eh;
517 	int data_length, hdr_length, dont_merge;
518 	unsigned th_seq, pkt_length;
519 	struct tcphdr *th;
520 	unsigned now;
521 
522 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
523 		struct ip *iph = c->next_nh;
524 		th = (struct tcphdr *)(iph + 1);
525 		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
526 	} else {
527 		struct ip6_hdr *iph = c->next_nh;
528 		th = (struct tcphdr *)(iph + 1);
529 		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
530 	}
531 
532 	hdr_length = (char *) th + th->th_off * 4 - eh;
533 	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
534 		       hdr_length);
535 	th_seq = ntohl(th->th_seq);
536 	dont_merge = ((data_length <= 0)
537 		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
538 
539 	/* Check for options other than aligned timestamp. */
540 	if (th->th_off != 5) {
541 		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
542 		if (th->th_off == 8 &&
543 		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
544 					(TCPOPT_NOP << 16) |
545 					(TCPOPT_TIMESTAMP << 8) |
546 					TCPOLEN_TIMESTAMP)) {
547 			/* timestamp option -- okay */
548 		} else {
549 			dont_merge = 1;
550 		}
551 	}
552 
553 	if (__predict_false(th_seq != c->next_seq)) {
554 		/* Out-of-order, so start counting again. */
555 		if (c->mbuf != NULL)
556 			sfxge_lro_deliver(&rxq->lro, c);
557 		c->n_in_order_pkts -= lro_loss_packets;
558 		c->next_seq = th_seq + data_length;
559 		++rxq->lro.n_misorder;
560 		goto deliver_buf_out;
561 	}
562 	c->next_seq = th_seq + data_length;
563 
564 	now = ticks;
565 	if (now - c->last_pkt_ticks > lro_idle_ticks) {
566 		++rxq->lro.n_drop_idle;
567 		if (c->mbuf != NULL)
568 			sfxge_lro_deliver(&rxq->lro, c);
569 		sfxge_lro_drop(rxq, c);
570 		return (0);
571 	}
572 	c->last_pkt_ticks = ticks;
573 
574 	if (c->n_in_order_pkts < lro_slow_start_packets) {
575 		/* May be in slow-start, so don't merge. */
576 		++rxq->lro.n_slow_start;
577 		++c->n_in_order_pkts;
578 		goto deliver_buf_out;
579 	}
580 
581 	if (__predict_false(dont_merge)) {
582 		if (c->mbuf != NULL)
583 			sfxge_lro_deliver(&rxq->lro, c);
584 		if (th->th_flags & (TH_FIN | TH_RST)) {
585 			++rxq->lro.n_drop_closed;
586 			sfxge_lro_drop(rxq, c);
587 			return (0);
588 		}
589 		goto deliver_buf_out;
590 	}
591 
592 	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
593 
594 	if (__predict_true(c->mbuf != NULL)) {
595 		/* Remove headers and any padding */
596 		rx_buf->mbuf->m_data += hdr_length;
597 		rx_buf->mbuf->m_len = data_length;
598 
599 		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
600 	} else {
601 		/* Remove any padding */
602 		rx_buf->mbuf->m_len = pkt_length;
603 
604 		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
605 	}
606 
607 	rx_buf->mbuf = NULL;
608 	return (1);
609 
610  deliver_buf_out:
611 	sfxge_rx_deliver(rxq->sc, rx_buf);
612 	return (1);
613 }
614 
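/*
 * Begin tracking a new TCP stream, recycling an entry from the free list if
 * possible.  Hash buckets are capped at lro_chain_max entries to bound the
 * lookup cost.
 */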
615 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
616 			       uint16_t l2_id, void *nh, struct tcphdr *th)
617 {
618 	unsigned bucket = conn_hash & st->conns_mask;
619 	struct sfxge_lro_conn *c;
620 
621 	if (st->conns_n[bucket] >= lro_chain_max) {
622 		++st->n_too_many;
623 		return;
624 	}
625 
626 	if (!TAILQ_EMPTY(&st->free_conns)) {
627 		c = TAILQ_FIRST(&st->free_conns);
628 		TAILQ_REMOVE(&st->free_conns, c, link);
629 	} else {
630 		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
631 		if (c == NULL)
632 			return;
633 		c->mbuf = NULL;
634 		c->next_buf.mbuf = NULL;
635 	}
636 
637 	/* Create the connection tracking data */
638 	++st->conns_n[bucket];
639 	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
640 	c->l2_id = l2_id;
641 	c->conn_hash = conn_hash;
642 	c->source = th->th_sport;
643 	c->dest = th->th_dport;
644 	c->n_in_order_pkts = 0;
645 	c->last_pkt_ticks = *(volatile int *)&ticks;
646 	c->delivered = 0;
647 	++st->n_new_stream;
648 	/* NB. We don't initialise c->next_seq, and it doesn't matter what
649 	 * value it has.  Most likely the next packet received for this
650 	 * connection will not match -- no harm done.
651 	 */
652 }
653 
654 /* Process mbuf and decide whether to dispatch it to the stack now or
655  * later.
656  */
657 static void
658 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
659 {
660 	struct sfxge_softc *sc = rxq->sc;
661 	struct mbuf *m = rx_buf->mbuf;
662 	struct ether_header *eh;
663 	struct sfxge_lro_conn *c;
664 	uint16_t l2_id;
665 	uint16_t l3_proto;
666 	void *nh;
667 	struct tcphdr *th;
668 	uint32_t conn_hash;
669 	unsigned bucket;
670 
671 	/* Get the hardware hash */
672 	conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
673 					    EFX_RX_HASHALG_TOEPLITZ,
674 					    mtod(m, uint8_t *));
675 
676 	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
677 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
678 		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
679 		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
680 			SFXGE_LRO_L2_ID_VLAN;
681 		l3_proto = veh->evl_proto;
682 		nh = veh + 1;
683 	} else {
684 		l2_id = 0;
685 		l3_proto = eh->ether_type;
686 		nh = eh + 1;
687 	}
688 
689 	/* Check whether this is a suitable packet (unfragmented
690 	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
691 	 * length, and compute a hash if necessary.  If not, return.
692 	 */
693 	if (l3_proto == htons(ETHERTYPE_IP)) {
694 		struct ip *iph = nh;
695 
696 		KASSERT(iph->ip_p == IPPROTO_TCP,
697 		    ("IPv4 protocol is not TCP, but packet marker is set"));
698 		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
699 		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
700 			goto deliver_now;
701 		th = (struct tcphdr *)(iph + 1);
702 	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
703 		struct ip6_hdr *iph = nh;
704 
705 		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
706 		    ("IPv6 next header is not TCP, but packet marker is set"));
707 		l2_id |= SFXGE_LRO_L2_ID_IPV6;
708 		th = (struct tcphdr *)(iph + 1);
709 	} else {
710 		goto deliver_now;
711 	}
712 
713 	bucket = conn_hash & rxq->lro.conns_mask;
714 
715 	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
716 		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
717 			continue;
718 		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
719 			continue;
720 		if (c->mbuf != NULL) {
721 			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
722 				struct ip *c_iph, *iph = nh;
723 				c_iph = c->nh;
724 				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
725 				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
726 					continue;
727 			} else {
728 				struct ip6_hdr *c_iph, *iph = nh;
729 				c_iph = c->nh;
730 				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
731 				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
732 					continue;
733 			}
734 		}
735 
736 		/* Re-insert at head of list to reduce lookup time. */
737 		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
738 		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
739 
740 		if (c->next_buf.mbuf != NULL) {
741 			if (!sfxge_lro_try_merge(rxq, c))
742 				goto deliver_now;
743 		} else {
744 			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
745 			    active_link);
746 		}
747 		c->next_buf = *rx_buf;
748 		c->next_eh = eh;
749 		c->next_nh = nh;
750 
751 		rx_buf->mbuf = NULL;
752 		rx_buf->flags = EFX_DISCARD;
753 		return;
754 	}
755 
756 	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
757  deliver_now:
758 	sfxge_rx_deliver(sc, rx_buf);
759 }
760 
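/*
 * Called at the end of an event queue poll: deliver or re-buffer packets
 * held by active connections so latency stays bounded, and periodically
 * purge idle connections.
 */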
761 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
762 {
763 	struct sfxge_lro_state *st = &rxq->lro;
764 	struct sfxge_lro_conn *c;
765 	unsigned t;
766 
767 	while (!LIST_EMPTY(&st->active_conns)) {
768 		c = LIST_FIRST(&st->active_conns);
769 		if (!c->delivered && c->mbuf != NULL)
770 			sfxge_lro_deliver(st, c);
771 		if (sfxge_lro_try_merge(rxq, c)) {
772 			if (c->mbuf != NULL)
773 				sfxge_lro_deliver(st, c);
774 			LIST_REMOVE(c, active_link);
775 		}
776 		c->delivered = 0;
777 	}
778 
779 	t = *(volatile int *)&ticks;
780 	if (__predict_false(t != st->last_purge_ticks))
781 		sfxge_lro_purge_idle(rxq, t);
782 }
783 
784 #else	/* !SFXGE_LRO */
785 
786 static void
787 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
788 {
789 }
790 
791 static void
792 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
793 {
794 }
795 
796 #endif	/* SFXGE_LRO */
797 
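/*
 * Process descriptors completed by the hardware: drop discards and loopback
 * packets, fix up checksum flags, and hand each packet either to LRO or
 * directly to the stack.  Refills the ring when it drains below the refill
 * threshold.
 */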
798 void
799 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
800 {
801 	struct sfxge_softc *sc = rxq->sc;
802 	int if_capenable = sc->ifnet->if_capenable;
803 	int lro_enabled = if_capenable & IFCAP_LRO;
804 	unsigned int index;
805 	struct sfxge_evq *evq;
806 	unsigned int completed;
807 	unsigned int level;
808 	struct mbuf *m;
809 	struct sfxge_rx_sw_desc *prev = NULL;
810 
811 	index = rxq->index;
812 	evq = sc->evq[index];
813 
814 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
815 
816 	completed = rxq->completed;
817 	while (completed != rxq->pending) {
818 		unsigned int id;
819 		struct sfxge_rx_sw_desc *rx_desc;
820 
821 		id = completed++ & rxq->ptr_mask;
822 		rx_desc = &rxq->queue[id];
823 		m = rx_desc->mbuf;
824 
825 		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
826 			goto discard;
827 
828 		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
829 			goto discard;
830 
831 		/* Read the length from the pseudo header if required */
832 		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
833 			uint16_t tmp_size;
834 			int rc;
835 			rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
836 							   mtod(m, uint8_t *),
837 							   &tmp_size);
838 			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
839 			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
840 		}
841 
842 		prefetch_read_many(mtod(m, caddr_t));
843 
844 		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
845 		case EFX_PKT_IPV4:
846 			if (~if_capenable & IFCAP_RXCSUM)
847 				rx_desc->flags &=
848 				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
849 			break;
850 		case EFX_PKT_IPV6:
851 			if (~if_capenable & IFCAP_RXCSUM_IPV6)
852 				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
853 			break;
854 		case 0:
855 			/* Check for loopback packets */
856 			{
857 				struct ether_header *etherhp;
858 
859 				/*LINTED*/
860 				etherhp = mtod(m, struct ether_header *);
861 
862 				if (etherhp->ether_type ==
863 				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
864 					EFSYS_PROBE(loopback);
865 
866 					rxq->loopback++;
867 					goto discard;
868 				}
869 			}
870 			break;
871 		default:
872 			KASSERT(B_FALSE,
873 			    ("Rx descriptor with both IPv4 and IPv6 flags"));
874 			goto discard;
875 		}
876 
877 		/* Pass packet up the stack or into LRO (pipelined) */
878 		if (prev != NULL) {
879 			if (lro_enabled &&
880 			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
881 			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
882 				sfxge_lro(rxq, prev);
883 			else
884 				sfxge_rx_deliver(sc, prev);
885 		}
886 		prev = rx_desc;
887 		continue;
888 
889 discard:
890 		/* Return the packet to the pool */
891 		m_free(m);
892 		rx_desc->mbuf = NULL;
893 	}
894 	rxq->completed = completed;
895 
896 	level = rxq->added - rxq->completed;
897 
898 	/* Pass last packet up the stack or into LRO */
899 	if (prev != NULL) {
900 		if (lro_enabled &&
901 		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
902 		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
903 			sfxge_lro(rxq, prev);
904 		else
905 			sfxge_rx_deliver(sc, prev);
906 	}
907 
908 	/*
909 	 * If there are any pending flows and this is the end of the
910 	 * poll then they must be completed.
911 	 */
912 	if (eop)
913 		sfxge_lro_end_of_burst(rxq);
914 
915 	/* Top up the queue if necessary */
916 	if (level < rxq->refill_threshold)
917 		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
918 }
919 
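/*
 * Stop an Rx queue: request a hardware flush (retried up to three times),
 * wait for it to complete, drain any outstanding completions and destroy
 * the common code queue.
 */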
920 static void
921 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
922 {
923 	struct sfxge_rxq *rxq;
924 	struct sfxge_evq *evq;
925 	unsigned int count;
926 	unsigned int retry = 3;
927 
928 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
929 
930 	rxq = sc->rxq[index];
931 	evq = sc->evq[index];
932 
933 	SFXGE_EVQ_LOCK(evq);
934 
935 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
936 	    ("rxq not started"));
937 
938 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
939 
940 	callout_stop(&rxq->refill_callout);
941 
942 	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
943 		rxq->flush_state = SFXGE_FLUSH_PENDING;
944 
945 		SFXGE_EVQ_UNLOCK(evq);
946 
947 		/* Flush the receive queue */
948 		if (efx_rx_qflush(rxq->common) != 0) {
949 			SFXGE_EVQ_LOCK(evq);
950 			rxq->flush_state = SFXGE_FLUSH_FAILED;
951 			break;
952 		}
953 
954 		count = 0;
955 		do {
956 			/* Spin for 100 ms */
957 			DELAY(100000);
958 
959 			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
960 				break;
961 
962 		} while (++count < 20);
963 
964 		SFXGE_EVQ_LOCK(evq);
965 
966 		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
967 			/* Flush timeout - neither done nor failed */
968 			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
969 			    device_get_nameunit(sc->dev), index);
970 			rxq->flush_state = SFXGE_FLUSH_DONE;
971 		}
972 		retry--;
973 	}
974 	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
975 		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
976 		    device_get_nameunit(sc->dev), index);
977 		rxq->flush_state = SFXGE_FLUSH_DONE;
978 	}
979 
980 	rxq->pending = rxq->added;
981 	sfxge_rx_qcomplete(rxq, B_TRUE);
982 
983 	KASSERT(rxq->completed == rxq->pending,
984 	    ("rxq->completed != rxq->pending"));
985 
986 	rxq->added = 0;
987 	rxq->pushed = 0;
988 	rxq->pending = 0;
989 	rxq->completed = 0;
990 	rxq->loopback = 0;
991 
992 	/* Destroy the common code receive queue. */
993 	efx_rx_qdestroy(rxq->common);
994 
995 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
996 	    EFX_RXQ_NBUFS(sc->rxq_entries));
997 
998 	SFXGE_EVQ_UNLOCK(evq);
999 }
1000 
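/*
 * Start an Rx queue: program its buffer table entries, create and enable
 * the common code queue and perform the initial fill.
 */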
1001 static int
1002 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1003 {
1004 	struct sfxge_rxq *rxq;
1005 	efsys_mem_t *esmp;
1006 	struct sfxge_evq *evq;
1007 	int rc;
1008 
1009 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1010 
1011 	rxq = sc->rxq[index];
1012 	esmp = &rxq->mem;
1013 	evq = sc->evq[index];
1014 
1015 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1016 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1017 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1018 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1019 
1020 	/* Program the buffer table. */
1021 	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1022 	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1023 		return (rc);
1024 
1025 	/* Create the common code receive queue. */
1026 	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1027 	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1028 	    &rxq->common)) != 0)
1029 		goto fail;
1030 
1031 	SFXGE_EVQ_LOCK(evq);
1032 
1033 	/* Enable the receive queue. */
1034 	efx_rx_qenable(rxq->common);
1035 
1036 	rxq->init_state = SFXGE_RXQ_STARTED;
1037 	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1038 
1039 	/* Try to fill the queue from the pool. */
1040 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1041 
1042 	SFXGE_EVQ_UNLOCK(evq);
1043 
1044 	return (0);
1045 
1046 fail:
1047 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1048 	    EFX_RXQ_NBUFS(sc->rxq_entries));
1049 	return (rc);
1050 }
1051 
1052 void
1053 sfxge_rx_stop(struct sfxge_softc *sc)
1054 {
1055 	int index;
1056 
1057 	efx_mac_filter_default_rxq_clear(sc->enp);
1058 
1059 	/* Stop the receive queue(s) */
1060 	index = sc->rxq_count;
1061 	while (--index >= 0)
1062 		sfxge_rx_qstop(sc, index);
1063 
1064 	sc->rx_prefix_size = 0;
1065 	sc->rx_buffer_size = 0;
1066 
1067 	efx_rx_fini(sc->enp);
1068 }
1069 
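/*
 * Bring up the receive path: size and align the packet buffers, choose an
 * mbuf cluster zone, program the RSS indirection table and hash key, start
 * every queue and set the default Rx queue for MAC filtering.
 */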
1070 int
1071 sfxge_rx_start(struct sfxge_softc *sc)
1072 {
1073 	struct sfxge_intr *intr;
1074 	const efx_nic_cfg_t *encp;
1075 	size_t hdrlen, align, reserved;
1076 	int index;
1077 	int rc;
1078 
1079 	intr = &sc->intr;
1080 
1081 	/* Initialize the common code receive module. */
1082 	if ((rc = efx_rx_init(sc->enp)) != 0)
1083 		return (rc);
1084 
1085 	encp = efx_nic_cfg_get(sc->enp);
1086 	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1087 
1088 	/* Calculate the receive packet buffer size. */
1089 	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1090 
1091 	/* Ensure IP headers are 32bit aligned */
1092 	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1093 	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1094 
1095 	sc->rx_buffer_size += sc->rx_buffer_align;
1096 
1097 	/* Align end of packet buffer for RX DMA end padding */
1098 	align = MAX(1, encp->enc_rx_buf_align_end);
1099 	EFSYS_ASSERT(ISP2(align));
1100 	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1101 
1102 	/*
1103 	 * Standard mbuf zones only guarantee pointer-size alignment;
1104 	 * we need extra space to align to the cache line
1105 	 */
1106 	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1107 
1108 	/* Select zone for packet buffers */
1109 	if (reserved <= MCLBYTES)
1110 		sc->rx_cluster_size = MCLBYTES;
1111 	else if (reserved <= MJUMPAGESIZE)
1112 		sc->rx_cluster_size = MJUMPAGESIZE;
1113 	else if (reserved <= MJUM9BYTES)
1114 		sc->rx_cluster_size = MJUM9BYTES;
1115 	else
1116 		sc->rx_cluster_size = MJUM16BYTES;
1117 
1118 	/*
1119 	 * Set up the scale table.  Enable all hash types and hash insertion.
1120 	 */
1121 	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1122 		sc->rx_indir_table[index] = index % sc->rxq_count;
1123 	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1124 				       SFXGE_RX_SCALE_MAX)) != 0)
1125 		goto fail;
1126 	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1127 	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1128 	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1129 
1130 	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1131 				       sizeof(toep_key))) != 0)
1132 		goto fail;
1133 
1134 	/* Start the receive queue(s). */
1135 	for (index = 0; index < sc->rxq_count; index++) {
1136 		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1137 			goto fail2;
1138 	}
1139 
1140 	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1141 					    sc->intr.n_alloc > 1);
1142 	if (rc != 0)
1143 		goto fail3;
1144 
1145 	return (0);
1146 
1147 fail3:
1148 fail2:
1149 	while (--index >= 0)
1150 		sfxge_rx_qstop(sc, index);
1151 
1152 fail:
1153 	efx_rx_fini(sc->enp);
1154 
1155 	return (rc);
1156 }
1157 
1158 #ifdef SFXGE_LRO
1159 
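/* Allocate and initialise the per-queue LRO hash table and connection lists. */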
1160 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1161 {
1162 	struct sfxge_lro_state *st = &rxq->lro;
1163 	unsigned i;
1164 
1165 	st->conns_mask = lro_table_size - 1;
1166 	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1167 		("lro_table_size must be a power of 2"));
1168 	st->sc = rxq->sc;
1169 	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1170 			   M_SFXGE, M_WAITOK);
1171 	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1172 			     M_SFXGE, M_WAITOK);
1173 	for (i = 0; i <= st->conns_mask; ++i) {
1174 		TAILQ_INIT(&st->conns[i]);
1175 		st->conns_n[i] = 0;
1176 	}
1177 	LIST_INIT(&st->active_conns);
1178 	TAILQ_INIT(&st->free_conns);
1179 }
1180 
1181 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1182 {
1183 	struct sfxge_lro_state *st = &rxq->lro;
1184 	struct sfxge_lro_conn *c;
1185 	unsigned i;
1186 
1187 	/* Return cleanly if sfxge_lro_init() has not been called. */
1188 	if (st->conns == NULL)
1189 		return;
1190 
1191 	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1192 
1193 	for (i = 0; i <= st->conns_mask; ++i) {
1194 		while (!TAILQ_EMPTY(&st->conns[i])) {
1195 			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1196 			sfxge_lro_drop(rxq, c);
1197 		}
1198 	}
1199 
1200 	while (!TAILQ_EMPTY(&st->free_conns)) {
1201 		c = TAILQ_FIRST(&st->free_conns);
1202 		TAILQ_REMOVE(&st->free_conns, c, link);
1203 		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1204 		free(c, M_SFXGE);
1205 	}
1206 
1207 	free(st->conns_n, M_SFXGE);
1208 	free(st->conns, M_SFXGE);
1209 	st->conns = NULL;
1210 }
1211 
1212 #else
1213 
1214 static void
1215 sfxge_lro_init(struct sfxge_rxq *rxq)
1216 {
1217 }
1218 
1219 static void
1220 sfxge_lro_fini(struct sfxge_rxq *rxq)
1221 {
1222 }
1223 
1224 #endif	/* SFXGE_LRO */
1225 
1226 static void
1227 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1228 {
1229 	struct sfxge_rxq *rxq;
1230 
1231 	rxq = sc->rxq[index];
1232 
1233 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1234 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1235 
1236 	/* Free the context array and the flow table. */
1237 	free(rxq->queue, M_SFXGE);
1238 	sfxge_lro_fini(rxq);
1239 
1240 	/* Release DMA memory. */
1241 	sfxge_dma_free(&rxq->mem);
1242 
1243 	sc->rxq[index] = NULL;
1244 
1245 	free(rxq, M_SFXGE);
1246 }
1247 
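/*
 * Allocate the software state for one Rx queue: descriptor ring DMA memory,
 * buffer table entries, the software descriptor array and LRO state.
 */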
1248 static int
1249 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1250 {
1251 	struct sfxge_rxq *rxq;
1252 	struct sfxge_evq *evq;
1253 	efsys_mem_t *esmp;
1254 	int rc;
1255 
1256 	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1257 
1258 	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1259 	rxq->sc = sc;
1260 	rxq->index = index;
1261 	rxq->entries = sc->rxq_entries;
1262 	rxq->ptr_mask = rxq->entries - 1;
1263 	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1264 
1265 	sc->rxq[index] = rxq;
1266 	esmp = &rxq->mem;
1267 
1268 	evq = sc->evq[index];
1269 
1270 	/* Allocate and zero DMA space. */
1271 	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1272 		return (rc);
1273 
1274 	/* Allocate buffer table entries. */
1275 	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1276 				 &rxq->buf_base_id);
1277 
1278 	/* Allocate the context array and the flow table. */
1279 	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1280 	    M_SFXGE, M_WAITOK | M_ZERO);
1281 	sfxge_lro_init(rxq);
1282 
1283 	callout_init(&rxq->refill_callout, 1);
1284 
1285 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1286 
1287 	return (0);
1288 }
1289 
1290 static const struct {
1291 	const char *name;
1292 	size_t offset;
1293 } sfxge_rx_stats[] = {
1294 #define	SFXGE_RX_STAT(name, member) \
1295 	{ #name, offsetof(struct sfxge_rxq, member) }
1296 #ifdef SFXGE_LRO
1297 	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1298 	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1299 	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1300 	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1301 	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1302 	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1303 	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1304 	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1305 #endif
1306 };
1307 
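/* Sysctl handler: report an Rx statistic summed across all Rx queues. */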
1308 static int
1309 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1310 {
1311 	struct sfxge_softc *sc = arg1;
1312 	unsigned int id = arg2;
1313 	unsigned int sum, index;
1314 
1315 	/* Sum across all RX queues */
1316 	sum = 0;
1317 	for (index = 0; index < sc->rxq_count; index++)
1318 		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1319 					 sfxge_rx_stats[id].offset);
1320 
1321 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1322 }
1323 
1324 static void
1325 sfxge_rx_stat_init(struct sfxge_softc *sc)
1326 {
1327 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1328 	struct sysctl_oid_list *stat_list;
1329 	unsigned int id;
1330 
1331 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1332 
1333 	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1334 		SYSCTL_ADD_PROC(
1335 			ctx, stat_list,
1336 			OID_AUTO, sfxge_rx_stats[id].name,
1337 			CTLTYPE_UINT|CTLFLAG_RD,
1338 			sc, id, sfxge_rx_stat_handler, "IU",
1339 			"");
1340 	}
1341 }
1342 
1343 void
1344 sfxge_rx_fini(struct sfxge_softc *sc)
1345 {
1346 	int index;
1347 
1348 	index = sc->rxq_count;
1349 	while (--index >= 0)
1350 		sfxge_rx_qfini(sc, index);
1351 
1352 	sc->rxq_count = 0;
1353 }
1354 
1355 int
1356 sfxge_rx_init(struct sfxge_softc *sc)
1357 {
1358 	struct sfxge_intr *intr;
1359 	int index;
1360 	int rc;
1361 
1362 #ifdef SFXGE_LRO
1363 	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be a power of 2\n",
1365 		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1366 		rc = EINVAL;
1367 		goto fail_lro_table_size;
1368 	}
1369 
1370 	if (lro_idle_ticks == 0)
1371 		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1372 #endif
1373 
1374 	intr = &sc->intr;
1375 
1376 	sc->rxq_count = intr->n_alloc;
1377 
1378 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1379 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1380 
1381 	/* Initialize the receive queue(s) - one per interrupt. */
1382 	for (index = 0; index < sc->rxq_count; index++) {
1383 		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1384 			goto fail;
1385 	}
1386 
1387 	sfxge_rx_stat_init(sc);
1388 
1389 	return (0);
1390 
1391 fail:
1392 	/* Tear down the receive queue(s). */
1393 	while (--index >= 0)
1394 		sfxge_rx_qfini(sc, index);
1395 
1396 	sc->rxq_count = 0;
1397 
1398 #ifdef SFXGE_LRO
1399 fail_lro_table_size:
1400 #endif
1401 	return (rc);
1402 }
1403