xref: /illumos-gate/usr/src/uts/common/io/sfxge/sfxge_rx.c (revision ed093b41a93e8563e6e1e5dae0768dda2a7bcc27)
1 /*
2  * Copyright (c) 2008-2016 Solarflare Communications Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  *    this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright notice,
11  *    this list of conditions and the following disclaimer in the documentation
12  *    and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * The views and conclusions contained in the software and documentation are
27  * those of the authors and should not be interpreted as representing official
28  * policies, either expressed or implied, of the FreeBSD Project.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/sysmacros.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/atomic.h>
36 #include <sys/stream.h>
37 #include <sys/strsun.h>
38 #include <sys/strsubr.h>
39 #include <sys/strft.h>
40 #include <sys/ksynch.h>
41 #include <sys/ethernet.h>
42 #include <sys/crc32.h>
43 #include <sys/pattr.h>
44 #include <sys/cpu.h>
45 
46 #include <sys/ethernet.h>
47 #include <inet/ip.h>
48 
49 #include <netinet/in.h>
50 #include <netinet/ip.h>
51 #include <netinet/tcp.h>
52 
53 #include "sfxge.h"
54 
55 #include "efx.h"
56 
/* How long to wait for a response to an RXQ flush (in microseconds) */
#define	SFXGE_RX_QFLUSH_USEC	(2 * 1000 * 1000)

/* Number of times an RXQ flush is tried when it fails */
#define	SFXGE_RX_QFLUSH_TRIES	(5)

/* Default number of packet buffers preallocated for an RXQ */
#define	SFXGE_RX_QPREALLOC	(0)
65 
66 /* Receive packet DMA attributes */
67 static ddi_device_acc_attr_t sfxge_rx_packet_devacc = {
68 
69 	DDI_DEVICE_ATTR_V0,	/* devacc_attr_version */
70 	DDI_NEVERSWAP_ACC,	/* devacc_attr_endian_flags */
71 	DDI_STRICTORDER_ACC	/* devacc_attr_dataorder */
72 };
73 
74 static ddi_dma_attr_t sfxge_rx_packet_dma_attr = {
75 	DMA_ATTR_V0,		/* dma_attr_version	*/
76 	0,			/* dma_attr_addr_lo	*/
77 	0xffffffffffffffffull,	/* dma_attr_addr_hi	*/
78 	0xffffffffffffffffull,	/* dma_attr_count_max	*/
79 	SFXGE_CPU_CACHE_SIZE,	/* dma_attr_align	*/
80 	0xffffffff,		/* dma_attr_burstsizes	*/
81 	1,			/* dma_attr_minxfer	*/
82 	0xffffffffffffffffull,	/* dma_attr_maxxfer	*/
83 	0xffffffffffffffffull,	/* dma_attr_seg		*/
84 	1,			/* dma_attr_sgllen	*/
85 	1,			/* dma_attr_granular	*/
86 	0			/* dma_attr_flags	*/
87 };
88 
89 /* Receive queue DMA attributes */
90 static ddi_device_acc_attr_t sfxge_rxq_devacc = {
91 
92 	DDI_DEVICE_ATTR_V0,	/* devacc_attr_version */
93 	DDI_NEVERSWAP_ACC,	/* devacc_attr_endian_flags */
94 	DDI_STRICTORDER_ACC	/* devacc_attr_dataorder */
95 };
96 
97 static ddi_dma_attr_t sfxge_rxq_dma_attr = {
98 	DMA_ATTR_V0,		/* dma_attr_version	*/
99 	0,			/* dma_attr_addr_lo	*/
100 	0xffffffffffffffffull,	/* dma_attr_addr_hi	*/
101 	0xffffffffffffffffull,	/* dma_attr_count_max	*/
102 	EFX_BUF_SIZE,		/* dma_attr_align	*/
103 	0xffffffff,		/* dma_attr_burstsizes	*/
104 	1,			/* dma_attr_minxfer	*/
105 	0xffffffffffffffffull,	/* dma_attr_maxxfer	*/
106 	0xffffffffffffffffull,	/* dma_attr_seg		*/
107 	1,			/* dma_attr_sgllen	*/
108 	1,			/* dma_attr_granular	*/
109 	0			/* dma_attr_flags	*/
110 };
111 
112 /* Forward declaration */
113 static void sfxge_rx_qpreallocate(sfxge_rxq_t *srp, int nprealloc);
114 
115 static int
116 sfxge_rx_packet_ctor(void *buf, void *arg, int kmflags)
117 {
118 	sfxge_rx_packet_t *srpp = buf;
119 	sfxge_t *sp = arg;
120 	dev_info_t *dip = sp->s_dip;
121 	int err;
122 
123 	ASSERT3U(sizeof (srpp->__srp_u1.__srp_s1), <=,
124 	    sizeof (srpp->__srp_u1.__srp_pad));
125 	ASSERT3U(sizeof (srpp->__srp_u2.__srp_s2), <=,
126 	    sizeof (srpp->__srp_u2.__srp_pad));
127 
128 	bzero(buf, sizeof (sfxge_rx_packet_t));
129 
130 	/* Allocate a DMA handle */
131 	err = ddi_dma_alloc_handle(dip, &sfxge_rx_packet_dma_attr,
132 	    (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT,
133 	    NULL, &(srpp->srp_dma_handle));
134 	if (err != DDI_SUCCESS)
135 		goto fail1;
136 
137 	return (0);
138 
139 fail1:
140 	DTRACE_PROBE1(fail1, int, err);
141 
142 	SFXGE_OBJ_CHECK(srpp, sfxge_rx_packet_t);
143 
144 	return (-1);
145 }
146 
147 static void
148 sfxge_rx_packet_dtor(void *buf, void *arg)
149 {
150 	sfxge_rx_packet_t *srpp = buf;
151 
152 	_NOTE(ARGUNUSED(arg))
153 
154 	/* Free the DMA handle */
155 	ddi_dma_free_handle(&(srpp->srp_dma_handle));
156 	srpp->srp_dma_handle = NULL;
157 
158 	SFXGE_OBJ_CHECK(srpp, sfxge_rx_packet_t);
159 }
160 
161 static int
162 sfxge_rx_qctor(void *buf, void *arg, int kmflags)
163 {
164 	sfxge_rxq_t *srp = buf;
165 	efsys_mem_t *esmp = &(srp->sr_mem);
166 	sfxge_t *sp = arg;
167 	sfxge_dma_buffer_attr_t dma_attr;
168 	sfxge_rx_fpp_t *srfppp;
169 	int nprealloc;
170 	unsigned int id;
171 	int rc;
172 
173 	/* Compile-time structure layout checks */
174 	EFX_STATIC_ASSERT(sizeof (srp->__sr_u1.__sr_s1) <=
175 	    sizeof (srp->__sr_u1.__sr_pad));
176 	EFX_STATIC_ASSERT(sizeof (srp->__sr_u2.__sr_s2) <=
177 	    sizeof (srp->__sr_u2.__sr_pad));
178 	EFX_STATIC_ASSERT(sizeof (srp->__sr_u3.__sr_s3) <=
179 	    sizeof (srp->__sr_u3.__sr_pad));
180 
181 	bzero(buf, sizeof (sfxge_rxq_t));
182 
183 	srp->sr_sp = sp;
184 
185 	dma_attr.sdba_dip	 = sp->s_dip;
186 	dma_attr.sdba_dattrp	 = &sfxge_rxq_dma_attr;
187 	dma_attr.sdba_callback	 = DDI_DMA_SLEEP;
188 	dma_attr.sdba_length	 = EFX_RXQ_SIZE(sp->s_rxq_size);
189 	dma_attr.sdba_memflags	 = DDI_DMA_CONSISTENT;
190 	dma_attr.sdba_devaccp	 = &sfxge_rxq_devacc;
191 	dma_attr.sdba_bindflags	 = DDI_DMA_READ | DDI_DMA_CONSISTENT;
192 	dma_attr.sdba_maxcookies = 1;
193 	dma_attr.sdba_zeroinit	 = B_FALSE;
194 
195 	if ((rc = sfxge_dma_buffer_create(esmp, &dma_attr)) != 0)
196 		goto fail1;
197 
198 	/* Allocate some buffer table entries */
199 	if ((rc = sfxge_sram_buf_tbl_alloc(sp, EFX_RXQ_NBUFS(sp->s_rxq_size),
200 	    &(srp->sr_id))) != 0)
201 		goto fail2;
202 
203 	/* Allocate the context array */
204 	if ((srp->sr_srpp = kmem_zalloc(sizeof (sfxge_rx_packet_t *) *
205 	    sp->s_rxq_size, kmflags)) == NULL) {
206 		rc = ENOMEM;
207 		goto fail3;
208 	}
209 
210 	/* Allocate the flow table */
211 	if ((srp->sr_flow = kmem_zalloc(sizeof (sfxge_rx_flow_t) *
212 	    SFXGE_MAX_FLOW, kmflags)) == NULL) {
213 		rc = ENOMEM;
214 		goto fail4;
215 	}
216 
217 	srp->sr_srfpp = &(srp->sr_srfp);
218 	srp->sr_rto = drv_usectohz(200000);
219 
220 	srp->sr_mpp = &(srp->sr_mp);
221 
222 	/* Initialize the free packet pool */
223 	srfppp = &(srp->sr_fpp);
224 	if ((srfppp->srfpp_putp = kmem_zalloc(SFXGE_CPU_CACHE_SIZE *
225 	    SFXGE_RX_FPP_NSLOTS, kmflags)) == NULL) {
226 		rc = ENOMEM;
227 		goto fail5;
228 	}
229 	for (id = 0; id < SFXGE_RX_FPP_NSLOTS; id++) {
230 		sfxge_rx_fpp_putlist_t *putp;
231 		size_t off;
232 
233 		off = id * SFXGE_CPU_CACHE_SIZE;
234 		putp = (void *)(srfppp->srfpp_putp + off);
235 
236 		putp->srfpl_putp = NULL;
237 		putp->srfpl_putpp = &(putp->srfpl_putp);
238 		mutex_init(&(putp->srfpl_lock), NULL, MUTEX_DRIVER,
239 		    DDI_INTR_PRI(sp->s_intr.si_intr_pri));
240 	}
241 
242 	cv_init(&(srp->sr_flush_kv), NULL, CV_DRIVER, NULL);
243 
244 	/* Preallocate some packets on the free packet pool */
245 	nprealloc = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
246 	    DDI_PROP_DONTPASS, "rx_prealloc_pkt_buffers", SFXGE_RX_QPREALLOC);
247 	sfxge_rx_qpreallocate(srp, nprealloc);
248 
249 
250 	return (0);
251 
252 fail5:
253 	DTRACE_PROBE(fail5);
254 
255 	srp->sr_mpp = NULL;
256 
257 	srp->sr_rto = 0;
258 	srp->sr_srfpp = NULL;
259 
260 	/* Free the flow table */
261 	kmem_free(srp->sr_flow, sizeof (sfxge_rx_flow_t) *
262 	    SFXGE_MAX_FLOW);
263 	srp->sr_flow = NULL;
264 
265 fail4:
266 	DTRACE_PROBE(fail4);
267 
268 	/* Free the context array */
269 	kmem_free(srp->sr_srpp, sizeof (sfxge_rx_packet_t *) *
270 	    sp->s_rxq_size);
271 	srp->sr_srpp = NULL;
272 
273 fail3:
274 	DTRACE_PROBE(fail3);
275 
276 	/* Free the buffer table entries */
277 	sfxge_sram_buf_tbl_free(sp, srp->sr_id,
278 	    EFX_RXQ_NBUFS(sp->s_rxq_size));
279 	srp->sr_id = 0;
280 
281 fail2:
282 	DTRACE_PROBE(fail2);
283 	/* Remove dma setup */
284 	sfxge_dma_buffer_destroy(esmp);
285 
286 fail1:
287 	DTRACE_PROBE1(fail1, int, rc);
288 
289 	srp->sr_sp = NULL;
290 
291 	SFXGE_OBJ_CHECK(srp, sfxge_rxq_t);
292 
293 	return (-1);
294 }
295 
296 static void
297 sfxge_rx_qdtor(void *buf, void *arg)
298 {
299 	sfxge_rxq_t *srp = buf;
300 	efsys_mem_t *esmp = &(srp->sr_mem);
301 	sfxge_t *sp = srp->sr_sp;
302 	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
303 	unsigned int id;
304 
305 	_NOTE(ARGUNUSED(arg))
306 
307 	cv_destroy(&(srp->sr_flush_kv));
308 
309 	/* Tear down the free packet pool */
310 	for (id = 0; id < SFXGE_RX_FPP_NSLOTS; id++) {
311 		sfxge_rx_fpp_putlist_t *putp;
312 		size_t off;
313 
314 		off = id * SFXGE_CPU_CACHE_SIZE;
315 		putp = (void *)(srfppp->srfpp_putp + off);
316 
317 		putp->srfpl_putpp = NULL;
318 		mutex_destroy(&(putp->srfpl_lock));
319 
320 		SFXGE_OBJ_CHECK(putp, sfxge_rx_fpp_putlist_t);
321 	}
322 	kmem_free(srfppp->srfpp_putp, SFXGE_CPU_CACHE_SIZE *
323 	    SFXGE_RX_FPP_NSLOTS);
324 	srfppp->srfpp_putp = NULL;
325 
326 	srp->sr_mpp = NULL;
327 
328 	srp->sr_rto = 0;
329 	srp->sr_srfpp = NULL;
330 
331 	/* Free the flow table */
332 	kmem_free(srp->sr_flow, sizeof (sfxge_rx_flow_t) *
333 	    SFXGE_MAX_FLOW);
334 	srp->sr_flow = NULL;
335 
336 	/* Free the context array */
337 	kmem_free(srp->sr_srpp, sizeof (sfxge_rx_packet_t *) *
338 	    sp->s_rxq_size);
339 	srp->sr_srpp = NULL;
340 
341 	/* Free the buffer table entries */
342 	sfxge_sram_buf_tbl_free(sp, srp->sr_id,
343 	    EFX_RXQ_NBUFS(sp->s_rxq_size));
344 	srp->sr_id = 0;
345 
346 	/* Tear down dma setup */
347 	sfxge_dma_buffer_destroy(esmp);
348 
349 	SFXGE_OBJ_CHECK(srp, sfxge_rxq_t);
350 }
351 
352 /* Note: This function takes ownership of *srpp. */
353 static inline void
354 sfxge_rx_qfpp_put(sfxge_rxq_t *srp, sfxge_rx_packet_t *srpp)
355 {
356 	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
357 	mblk_t *mp = srpp->srp_mp;
358 	unsigned int id;
359 	size_t off;
360 	sfxge_rx_fpp_putlist_t *putp;
361 
362 	ASSERT3P(mp->b_next, ==, NULL);
363 	ASSERT3P(mp->b_prev, ==, NULL);
364 
365 	id = CPU->cpu_seqid & SFXGE_RX_FPP_MASK;
366 	off = id * SFXGE_CPU_CACHE_SIZE;
367 
368 	ASSERT3P(srpp->srp_putp, ==, srfppp->srfpp_putp);
369 	putp = (void *)(srpp->srp_putp + off);
370 
371 	mutex_enter(&(putp->srfpl_lock));
372 	putp->srfpl_count++;
373 	*putp->srfpl_putpp = mp;
374 	putp->srfpl_putpp = &(mp->b_next);
375 	mutex_exit(&(putp->srfpl_lock));
376 }
377 
378 static unsigned int
379 sfxge_rx_qfpp_swizzle(sfxge_rxq_t *srp)
380 {
381 	sfxge_t *sp = srp->sr_sp;
382 	unsigned int index = srp->sr_index;
383 	sfxge_evq_t *sep = sp->s_sep[index];
384 	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
385 	unsigned int start;
386 	unsigned int id;
387 	mblk_t *p;
388 	mblk_t **pp;
389 	unsigned int count;
390 	unsigned int loaned;
391 
392 	ASSERT(mutex_owned(&(sep->se_lock)));
393 
394 	/* We want to access the put list for the current CPU last */
395 	id = start = (CPU->cpu_seqid + 1) & SFXGE_RX_FPP_MASK;
396 
397 	do {
398 		sfxge_rx_fpp_putlist_t *putp;
399 		size_t off;
400 
401 		off = id * SFXGE_CPU_CACHE_SIZE;
402 		id  = (id + 1) & SFXGE_RX_FPP_MASK;
403 
404 		putp = (void *)(srfppp->srfpp_putp + off);
405 
406 		/* Acquire the put list */
407 		mutex_enter(&(putp->srfpl_lock));
408 
409 		p = putp->srfpl_putp;
410 		pp = putp->srfpl_putpp;
411 		count = putp->srfpl_count;
412 
413 		putp->srfpl_putp = NULL;
414 		putp->srfpl_putpp = &(putp->srfpl_putp);
415 		putp->srfpl_count = 0;
416 
417 		mutex_exit(&(putp->srfpl_lock));
418 
419 		if (p == NULL)
420 			continue;
421 
422 		/* Add the list to the head of the get list */
423 		*pp = srfppp->srfpp_get;
424 		srfppp->srfpp_get = p;
425 
426 		/* Adjust the counters */
427 		ASSERT3U(srfppp->srfpp_loaned, >=, count);
428 		srfppp->srfpp_loaned -= count;
429 		srfppp->srfpp_count += count;
430 
431 #if 0
432 		/* NOTE: this probe is disabled because it is expensive!! */
433 		DTRACE_PROBE2(count,
434 		    unsigned int, (id - 1) & SFXGE_RX_FPP_MASK,
435 		    unsigned int, count);
436 #endif
437 
438 	} while (id != start);
439 
440 	/* Return the number of packets yet to appear in the put list */
441 	loaned = srfppp->srfpp_loaned;
442 
443 
444 	return (loaned);
445 }
446 
447 
/* Free-routine descriptor recorded in the data block of an mblk */
#define	DB_FRTNP(_mp)	((_mp)->b_datap->db_frtnp)
449 
450 static void
451 sfxge_rx_qfpp_empty(sfxge_rxq_t *srp)
452 {
453 	sfxge_t *sp = srp->sr_sp;
454 	unsigned int index = srp->sr_index;
455 	sfxge_evq_t *sep = sp->s_sep[index];
456 	sfxge_rx_fpp_t *srfppp;
457 	mblk_t *mp;
458 
459 	mutex_enter(&(sep->se_lock));
460 	srfppp = &(srp->sr_fpp);
461 
462 	/* Swizzle put list to get list */
463 	(void) sfxge_rx_qfpp_swizzle(srp);
464 	ASSERT3U(srfppp->srfpp_loaned, ==, 0);
465 
466 	mp = srfppp->srfpp_get;
467 	srfppp->srfpp_get = NULL;
468 
469 	/* Free the remainder */
470 	while (mp != NULL) {
471 		mblk_t *next;
472 		frtn_t *freep;
473 		sfxge_rx_packet_t *srpp;
474 
475 		next = mp->b_next;
476 		mp->b_next = NULL;
477 
478 		ASSERT3U(srfppp->srfpp_count, >, 0);
479 		srfppp->srfpp_count--;
480 
481 		freep = DB_FRTNP(mp);
482 		/*
483 		 * ASSERT3P(freep->free_func, ==, sfxge_rx_qpacket_free);
484 		 *   is implied by srpp test below
485 		 */
486 		/*LINTED*/
487 		srpp = (sfxge_rx_packet_t *)(freep->free_arg);
488 		ASSERT3P(srpp->srp_mp, ==, mp);
489 		ASSERT3P(mp->b_cont, ==, NULL);
490 		srpp->srp_recycle = B_FALSE;
491 
492 		freeb(mp);
493 
494 		mp = next;
495 	}
496 	ASSERT3U(srfppp->srfpp_count, ==, 0);
497 
498 	srfppp->srfpp_min = 0;
499 
500 	mutex_exit(&(sep->se_lock));
501 }
502 
503 /*
504  * This is an estimate of all memory consumed per RX packet
505  * it can be inaccurate but but sp->s_rx_pkt_mem_alloc mustn't drift
506  */
507 static uint64_t
508 sfxge_rx_pkt_mem_approx(const sfxge_rx_packet_t *srpp)
509 {
510 	return (srpp->srp_mblksize + sizeof (mblk_t) + sizeof (dblk_t) +
511 	    sizeof (sfxge_rx_packet_t));
512 }
513 
514 static void
515 sfxge_rx_qpacket_destroy(sfxge_rxq_t *srp, sfxge_rx_packet_t *srpp)
516 {
517 	sfxge_t *sp = srp->sr_sp;
518 	int64_t delta = sfxge_rx_pkt_mem_approx(srpp);
519 
520 	ASSERT(!(srpp->srp_recycle));
521 	ASSERT3P(srpp->srp_mp, ==, NULL);
522 
523 	srpp->srp_off = 0;
524 	srpp->srp_thp = NULL;
525 	srpp->srp_iphp = NULL;
526 	srpp->srp_etherhp = NULL;
527 	srpp->srp_size = 0;
528 	srpp->srp_flags = 0;
529 
530 	bzero(&(srpp->srp_free), sizeof (frtn_t));
531 
532 	srpp->srp_mblksize = 0;
533 	srpp->srp_base = NULL;
534 
535 	/* Unbind the DMA memory from the DMA handle */
536 	srpp->srp_addr = 0;
537 	(void) ddi_dma_unbind_handle(srpp->srp_dma_handle);
538 
539 	/* Free the DMA memory */
540 	srpp->srp_base = NULL;
541 	ddi_dma_mem_free(&(srpp->srp_acc_handle));
542 	srpp->srp_acc_handle = NULL;
543 
544 	srpp->srp_putp = NULL;
545 	srpp->srp_srp = NULL;
546 
547 	kmem_cache_free(sp->s_rpc, srpp);
548 	if (sp->s_rx_pkt_mem_max)
549 		atomic_add_64(&sp->s_rx_pkt_mem_alloc, -delta);
550 }
551 
552 static void
553 sfxge_rx_qpacket_free(void *arg)
554 {
555 	sfxge_rx_packet_t *srpp = arg;
556 	sfxge_rxq_t *srp = srpp->srp_srp;
557 
558 	/*
559 	 * WARNING "man -s 9f esballoc"  states:
560 	 * => runs sync from the thread calling freeb()
561 	 * => must not sleep, or access data structures that could be freed
562 	 */
563 
564 	/* Check whether we want to recycle the receive packets */
565 	if (srpp->srp_recycle) {
566 		frtn_t *freep;
567 		mblk_t *mp;
568 		size_t size;
569 
570 		freep = &(srpp->srp_free);
571 		ASSERT3P(freep->free_func, ==, sfxge_rx_qpacket_free);
572 		ASSERT3P(freep->free_arg, ==, (caddr_t)srpp);
573 
574 		/*
575 		 * Allocate a matching mblk_t before the current one is
576 		 * freed.
577 		 */
578 		size = srpp->srp_mblksize;
579 
580 		if ((mp = desballoc(srpp->srp_base, size, BPRI_HI,
581 		    freep)) != NULL) {
582 			srpp->srp_mp = mp;
583 
584 			/* NORMAL recycled case */
585 			sfxge_rx_qfpp_put(srp, srpp);
586 			return;
587 		}
588 	}
589 
590 	srpp->srp_mp = NULL;
591 
592 	sfxge_rx_qpacket_destroy(srp, srpp);
593 }
594 
595 static sfxge_rx_packet_t *
596 sfxge_rx_qpacket_create(sfxge_rxq_t *srp)
597 {
598 	sfxge_t *sp = srp->sr_sp;
599 	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
600 	sfxge_rx_packet_t *srpp;
601 	size_t size;
602 	caddr_t base;
603 	size_t unit;
604 	ddi_dma_cookie_t dmac;
605 	unsigned int ncookies;
606 	frtn_t *freep;
607 	mblk_t *mp;
608 	int err;
609 	int rc;
610 
611 	size = sp->s_rx_buffer_size;
612 
613 	if (sp->s_rx_pkt_mem_max &&
614 	    (sp->s_rx_pkt_mem_alloc + size >= sp->s_rx_pkt_mem_max)) {
615 		DTRACE_PROBE(rx_pkt_mem_max);
616 		srp->sr_kstat.srk_rx_pkt_mem_limit++;
617 		return (NULL);
618 	}
619 
620 	/* Allocate a new packet */
621 	if ((srpp = kmem_cache_alloc(sp->s_rpc, KM_NOSLEEP)) == NULL) {
622 		srp->sr_kstat.srk_kcache_alloc_nomem++;
623 		rc = ENOMEM;
624 		goto fail1;
625 	}
626 
627 	srpp->srp_srp = srp;
628 	srpp->srp_putp = srfppp->srfpp_putp;
629 
630 	/* Allocate some DMA memory */
631 	err = ddi_dma_mem_alloc(srpp->srp_dma_handle, size,
632 	    &sfxge_rx_packet_devacc, DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
633 	    NULL, &base, &unit, &(srpp->srp_acc_handle));
634 	switch (err) {
635 	case DDI_SUCCESS:
636 		break;
637 
638 	case DDI_FAILURE:
639 		srp->sr_kstat.srk_dma_alloc_nomem++;
640 		rc = ENOMEM;
641 		goto fail2;
642 
643 	default:
644 		srp->sr_kstat.srk_dma_alloc_fail++;
645 		rc = EFAULT;
646 		goto fail2;
647 	}
648 
649 	/* Adjust the buffer to align the start of the DMA area correctly */
650 	base += sp->s_rx_buffer_align;
651 	size -= sp->s_rx_buffer_align;
652 
653 	/* Bind the DMA memory to the DMA handle */
654 	err = ddi_dma_addr_bind_handle(srpp->srp_dma_handle, NULL,
655 	    base, size, DDI_DMA_READ | DDI_DMA_STREAMING,
656 	    DDI_DMA_DONTWAIT, NULL, &dmac, &ncookies);
657 	switch (err) {
658 	case DDI_DMA_MAPPED:
659 		break;
660 
661 	case DDI_DMA_INUSE:
662 		srp->sr_kstat.srk_dma_bind_fail++;
663 		rc = EEXIST;
664 		goto fail3;
665 
666 	case DDI_DMA_NORESOURCES:
667 		srp->sr_kstat.srk_dma_bind_nomem++;
668 		rc = ENOMEM;
669 		goto fail3;
670 
671 	case DDI_DMA_NOMAPPING:
672 		srp->sr_kstat.srk_dma_bind_fail++;
673 		rc = ENOTSUP;
674 		goto fail3;
675 
676 	case DDI_DMA_TOOBIG:
677 		srp->sr_kstat.srk_dma_bind_fail++;
678 		rc = EFBIG;
679 		goto fail3;
680 
681 	default:
682 		srp->sr_kstat.srk_dma_bind_fail++;
683 		rc = EFAULT;
684 		goto fail3;
685 	}
686 	ASSERT3U(ncookies, ==, 1);
687 
688 	srpp->srp_addr = dmac.dmac_laddress;
689 
690 	srpp->srp_base = (unsigned char *)base;
691 	srpp->srp_mblksize = size;
692 
693 	/*
694 	 * Allocate a STREAMS block: We use size 1 so that the allocator will
695 	 * use the first (and smallest) dblk cache.
696 	 */
697 	freep = &(srpp->srp_free);
698 	freep->free_func = sfxge_rx_qpacket_free;
699 	freep->free_arg  = (caddr_t)srpp;
700 
701 	if ((mp = desballoc(srpp->srp_base, size, BPRI_HI, freep)) == NULL) {
702 		srp->sr_kstat.srk_desballoc_fail++;
703 		rc = ENOMEM;
704 		goto fail4;
705 	}
706 
707 	srpp->srp_mp = mp;
708 	srpp->srp_recycle = B_TRUE;
709 
710 	if (sp->s_rx_pkt_mem_max) {
711 		int64_t delta = sfxge_rx_pkt_mem_approx(srpp);
712 		atomic_add_64(&sp->s_rx_pkt_mem_alloc, delta);
713 	}
714 
715 	return (srpp);
716 
717 fail4:
718 	DTRACE_PROBE(fail4);
719 
720 	bzero(&(srpp->srp_free), sizeof (frtn_t));
721 
722 	srpp->srp_mblksize = 0;
723 	srpp->srp_base = NULL;
724 
725 	/* Unbind the DMA memory from the DMA handle */
726 	srpp->srp_addr = 0;
727 	(void) ddi_dma_unbind_handle(srpp->srp_dma_handle);
728 
729 fail3:
730 	DTRACE_PROBE(fail3);
731 
732 	/* Free the DMA memory */
733 	ddi_dma_mem_free(&(srpp->srp_acc_handle));
734 	srpp->srp_acc_handle = NULL;
735 
736 fail2:
737 	DTRACE_PROBE(fail2);
738 
739 	srpp->srp_putp = NULL;
740 	srpp->srp_srp = NULL;
741 
742 	kmem_cache_free(sp->s_rpc, srpp);
743 
744 fail1:
745 	DTRACE_PROBE1(fail1, int, rc);
746 
747 	return (NULL);
748 }
749 
/* Maximum number of buffer addresses posted to the RXQ in one call */
#define	SFXGE_REFILL_BATCH	(64)
751 
752 /* Try to refill the RX descriptor ring from the associated free pkt pool */
753 static void
754 sfxge_rx_qrefill(sfxge_rxq_t *srp, unsigned int target)
755 {
756 	sfxge_t *sp = srp->sr_sp;
757 	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
758 	unsigned int index = srp->sr_index;
759 	sfxge_evq_t *sep = sp->s_sep[index];
760 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
761 	mblk_t *mp;
762 	int ntodo;
763 	unsigned int count;
764 	unsigned int batch;
765 	unsigned int rxfill;
766 	unsigned int mblksize;
767 
768 	prefetch_read_many(sp->s_enp);
769 	prefetch_read_many(srp->sr_erp);
770 
771 	ASSERT(mutex_owned(&(sep->se_lock)));
772 
773 	if (srp->sr_state != SFXGE_RXQ_STARTED)
774 		return;
775 
776 	rxfill = srp->sr_added - srp->sr_completed;
777 	ASSERT3U(rxfill, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));
778 	ntodo = min(EFX_RXQ_LIMIT(sp->s_rxq_size) - rxfill, target);
779 	ASSERT3U(ntodo, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));
780 
781 	if (ntodo == 0)
782 		goto out;
783 
784 	(void) sfxge_rx_qfpp_swizzle(srp);
785 
786 	mp = srfppp->srfpp_get;
787 	count = srfppp->srfpp_count;
788 	mblksize = sp->s_rx_buffer_size - sp->s_rx_buffer_align;
789 
790 	batch = 0;
791 	while (ntodo-- > 0) {
792 		mblk_t *next;
793 		frtn_t *freep;
794 		sfxge_rx_packet_t *srpp;
795 		unsigned int id;
796 
797 		if (mp == NULL)
798 			break;
799 
800 		next = mp->b_next;
801 		mp->b_next = NULL;
802 
803 		if (next != NULL)
804 			prefetch_read_many(next);
805 
806 		freep = DB_FRTNP(mp);
807 		/*LINTED*/
808 		srpp = (sfxge_rx_packet_t *)(freep->free_arg);
809 		ASSERT3P(srpp->srp_mp, ==, mp);
810 
811 		/* The MTU may have changed since the packet was allocated */
812 		if (MBLKSIZE(mp) != mblksize) {
813 			srpp->srp_recycle = B_FALSE;
814 
815 			freeb(mp);
816 
817 			--count;
818 			mp = next;
819 			continue;
820 		}
821 
822 		srpp->srp_off = 0;
823 		srpp->srp_thp = NULL;
824 		srpp->srp_iphp = NULL;
825 		srpp->srp_etherhp = NULL;
826 		srpp->srp_size = 0;
827 		srpp->srp_flags = EFX_DISCARD;
828 
829 		id = (srp->sr_added + batch) & (sp->s_rxq_size - 1);
830 		ASSERT(srp->sr_srpp[id] == NULL);
831 		srp->sr_srpp[id] = srpp;
832 
833 		addr[batch++] = srpp->srp_addr;
834 		if (batch == SFXGE_REFILL_BATCH) {
835 			efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
836 			    srp->sr_completed, srp->sr_added);
837 			srp->sr_added += batch;
838 			batch = 0;
839 		}
840 
841 		--count;
842 		mp = next;
843 	}
844 
845 	srfppp->srfpp_get = mp;
846 	srfppp->srfpp_count = count;
847 
848 	if (batch != 0) {
849 		efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
850 		    srp->sr_completed, srp->sr_added);
851 		srp->sr_added += batch;
852 	}
853 
854 	efx_rx_qpush(srp->sr_erp, srp->sr_added, &srp->sr_pushed);
855 
856 out:
857 	if (srfppp->srfpp_count < srfppp->srfpp_min)
858 		srfppp->srfpp_min = srfppp->srfpp_count;
859 }
860 
861 /* Preallocate packets and put them in the free packet pool */
862 static void
863 sfxge_rx_qpreallocate(sfxge_rxq_t *srp, int nprealloc)
864 {
865 	sfxge_rx_fpp_t *srfppp = &((srp)->sr_fpp);
866 	srfppp->srfpp_lowat = nprealloc;
867 	while (nprealloc-- > 0) {
868 		sfxge_rx_packet_t *srpp;
869 
870 		if ((srpp = sfxge_rx_qpacket_create(srp)) == NULL)
871 			break;
872 		sfxge_rx_qfpp_put(srp, srpp);
873 	}
874 }
875 
876 /* Try to refill the RX descriptor ring by allocating new packets */
877 static void
878 sfxge_rx_qfill(sfxge_rxq_t *srp, unsigned int target)
879 {
880 	sfxge_t *sp = srp->sr_sp;
881 	unsigned int index = srp->sr_index;
882 	sfxge_evq_t *sep = sp->s_sep[index];
883 	unsigned int batch;
884 	unsigned int rxfill;
885 	unsigned int mblksize;
886 	int ntodo;
887 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
888 	mblk_t *mp = NULL;
889 
890 	prefetch_read_many(sp->s_enp);
891 	prefetch_read_many(srp->sr_erp);
892 
893 	ASSERT(mutex_owned(&(sep->se_lock)));
894 
895 	if (srp->sr_state != SFXGE_RXQ_STARTED)
896 		return;
897 
898 	rxfill = srp->sr_added - srp->sr_completed;
899 	ASSERT3U(rxfill, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));
900 	ntodo = min(EFX_RXQ_LIMIT(sp->s_rxq_size) - rxfill, target);
901 	ASSERT3U(ntodo, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));
902 
903 	if (ntodo == 0)
904 		return;
905 
906 	mblksize = sp->s_rx_buffer_size - sp->s_rx_buffer_align;
907 
908 	batch = 0;
909 	while (ntodo-- > 0) {
910 		sfxge_rx_packet_t *srpp;
911 		unsigned int id;
912 
913 		if ((srpp = sfxge_rx_qpacket_create(srp)) == NULL)
914 			break;
915 
916 		mp = srpp->srp_mp;
917 
918 		ASSERT3U(MBLKSIZE(mp), ==, mblksize);
919 
920 		ASSERT3U(srpp->srp_off, ==, 0);
921 		ASSERT3P(srpp->srp_thp, ==, NULL);
922 		ASSERT3P(srpp->srp_iphp, ==, NULL);
923 		ASSERT3P(srpp->srp_etherhp, ==, NULL);
924 		ASSERT3U(srpp->srp_size, ==, 0);
925 
926 		srpp->srp_flags = EFX_DISCARD;
927 
928 		id = (srp->sr_added + batch) & (sp->s_rxq_size - 1);
929 		ASSERT(srp->sr_srpp[id] == NULL);
930 		srp->sr_srpp[id] = srpp;
931 
932 		addr[batch++] = srpp->srp_addr;
933 		if (batch == SFXGE_REFILL_BATCH) {
934 			efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
935 			    srp->sr_completed, srp->sr_added);
936 			srp->sr_added += batch;
937 			batch = 0;
938 		}
939 	}
940 
941 	if (batch != 0) {
942 		efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
943 		    srp->sr_completed, srp->sr_added);
944 		srp->sr_added += batch;
945 	}
946 
947 	efx_rx_qpush(srp->sr_erp, srp->sr_added, &srp->sr_pushed);
948 }
949 
950 void
951 sfxge_rx_qfpp_trim(sfxge_rxq_t *srp)
952 {
953 	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
954 	sfxge_t *sp = srp->sr_sp;
955 	unsigned int index = srp->sr_index;
956 	sfxge_evq_t *sep = sp->s_sep[index];
957 	mblk_t *p;
958 	mblk_t **pp;
959 	int count;
960 
961 	ASSERT(mutex_owned(&(sep->se_lock)));
962 
963 	if (srp->sr_state != SFXGE_RXQ_STARTED)
964 		goto done;
965 
966 	/* Make sure the queue is full */
967 	sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
968 
969 	/* The refill may have emptied the pool */
970 	if (srfppp->srfpp_min == 0)
971 		goto done;
972 
973 	/* Don't trim below the pool's low water mark */
974 	if (srfppp->srfpp_count <= srfppp->srfpp_lowat)
975 		goto done;
976 
977 	ASSERT(srfppp->srfpp_min <= srfppp->srfpp_count);
978 
979 	/* Trim to the largest of srfppp->srfpp_min and srfpp->srfpp_lowat */
980 	if (srfppp->srfpp_lowat > srfppp->srfpp_min)
981 		count = srfppp->srfpp_count - srfppp->srfpp_lowat;
982 	else
983 		count = srfppp->srfpp_count - srfppp->srfpp_min;
984 
985 	/* Walk the get list */
986 	pp = &(srfppp->srfpp_get);
987 	while (--count >= 0) {
988 		ASSERT(pp);
989 		p = *pp;
990 		ASSERT(p != NULL);
991 
992 		pp = &(p->b_next);
993 	}
994 	ASSERT(pp);
995 	p = *pp;
996 
997 	/* Truncate the get list */
998 	*pp = NULL;
999 
1000 	/* Free the remainder */
1001 	while (p != NULL) {
1002 		mblk_t *next;
1003 		frtn_t *freep;
1004 		sfxge_rx_packet_t *srpp;
1005 
1006 		next = p->b_next;
1007 		p->b_next = NULL;
1008 
1009 		ASSERT3U(srfppp->srfpp_min, >, 0);
1010 		srfppp->srfpp_min--;
1011 		srfppp->srfpp_count--;
1012 
1013 		freep = DB_FRTNP(p);
1014 		/*LINTED*/
1015 		srpp = (sfxge_rx_packet_t *)(freep->free_arg);
1016 		ASSERT3P(srpp->srp_mp, ==, p);
1017 
1018 		srpp->srp_recycle = B_FALSE;
1019 
1020 		freeb(p);
1021 
1022 		p = next;
1023 	}
1024 
1025 done:
1026 	srfppp->srfpp_min = srfppp->srfpp_count;
1027 }
1028 
1029 static void
1030 sfxge_rx_qpoll(void *arg)
1031 {
1032 	sfxge_rxq_t *srp = arg;
1033 	sfxge_t *sp = srp->sr_sp;
1034 	unsigned int index = srp->sr_index;
1035 	sfxge_evq_t *sep = sp->s_sep[index];
1036 	uint16_t magic;
1037 
1038 	/*
1039 	 * man timeout(9f) states that this code should adhere to the
1040 	 * same requirements as a softirq handler - DO NOT BLOCK
1041 	 */
1042 
1043 	/*
1044 	 * Post an event to the event queue to cause the free packet pool to be
1045 	 * trimmed if it is oversize.
1046 	 */
1047 	magic = SFXGE_MAGIC_RX_QFPP_TRIM | index;
1048 
1049 #if defined(DEBUG)
1050 	/* This is guaranteed due to the start/stop order of rx and ev */
1051 	ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);
1052 	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);
1053 #else
1054 	/*
1055 	 * Bug22691 WORKAROUND:
1056 	 * This handler has been observed in the field to be invoked for a
1057 	 * queue in the INITIALIZED state, which should never happen.
1058 	 * Until the mechanism for this is properly understood, add defensive
1059 	 * checks.
1060 	 */
1061 	if ((sep->se_state != SFXGE_EVQ_STARTED) ||
1062 	    (srp->sr_state != SFXGE_RXQ_STARTED) ||
1063 	    (!sep->se_eep)) {
1064 		dev_err(sp->s_dip, CE_WARN, SFXGE_CMN_ERR
1065 		    "RXQ[%d] bad state in sfxge_rx_qpoll %d %d %p",
1066 		    index, sep->se_state, srp->sr_state, sep->se_eep);
1067 		return;
1068 	}
1069 #endif
1070 	efx_ev_qpost(sep->se_eep, magic);
1071 
1072 	srp->sr_tid = timeout(sfxge_rx_qpoll, srp,
1073 	    drv_usectohz(sp->s_rxq_poll_usec));
1074 }
1075 
1076 static void
1077 sfxge_rx_qpoll_start(sfxge_rxq_t *srp)
1078 {
1079 	sfxge_t *sp = srp->sr_sp;
1080 	unsigned int index = srp->sr_index;
1081 	sfxge_evq_t *sep = sp->s_sep[index];
1082 
1083 	ASSERT(mutex_owned(&(sep->se_lock)));
1084 	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);
1085 
1086 	/* Schedule a poll */
1087 	ASSERT3P(srp->sr_tid, ==, 0);
1088 	srp->sr_tid = timeout(sfxge_rx_qpoll, srp, 0);
1089 }
1090 
1091 static void
1092 sfxge_rx_qpoll_stop(sfxge_rxq_t *srp)
1093 {
1094 	sfxge_t *sp = srp->sr_sp;
1095 	unsigned int index = srp->sr_index;
1096 	sfxge_evq_t *sep = sp->s_sep[index];
1097 	timeout_id_t tid;
1098 
1099 	ASSERT(mutex_owned(&(sep->se_lock)));
1100 	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);
1101 
1102 	/*
1103 	 * Cancel the qpoll timer. Care is needed as this function
1104 	 * can race with sfxge_rx_qpoll() for timeout id updates.
1105 	 *
1106 	 * Do not hold locks used by any timeout(9f) handlers across
1107 	 * calls to untimeout(9f) as this will deadlock.
1108 	 */
1109 	tid = 0;
1110 	while ((srp->sr_tid != 0) && (srp->sr_tid != tid)) {
1111 		tid = srp->sr_tid;
1112 		(void) untimeout(tid);
1113 	}
1114 	srp->sr_tid = 0;
1115 }
1116 
1117 static int
1118 sfxge_rx_kstat_update(kstat_t *ksp, int rw)
1119 {
1120 	sfxge_rxq_t *srp = ksp->ks_private;
1121 	sfxge_t *sp = srp->sr_sp;
1122 	unsigned int index = srp->sr_index;
1123 	sfxge_evq_t *sep = sp->s_sep[index];
1124 	kstat_named_t *knp;
1125 	int rc;
1126 
1127 	if (rw != KSTAT_READ) {
1128 		rc = EACCES;
1129 		goto fail1;
1130 	}
1131 
1132 	ASSERT(mutex_owned(&(sep->se_lock)));
1133 	if (srp->sr_state != SFXGE_RXQ_STARTED)
1134 		goto done;
1135 
1136 	knp = ksp->ks_data;
1137 	/* NB pointer post-increment below */
1138 	knp++->value.ui32 = srp->sr_kstat.srk_rx_pkt_mem_limit;
1139 	knp++->value.ui32 = srp->sr_kstat.srk_kcache_alloc_nomem;
1140 	knp++->value.ui32 = srp->sr_kstat.srk_dma_alloc_nomem;
1141 	knp++->value.ui32 = srp->sr_kstat.srk_dma_alloc_fail;
1142 	knp++->value.ui32 = srp->sr_kstat.srk_dma_bind_nomem;
1143 	knp++->value.ui32 = srp->sr_kstat.srk_dma_bind_fail;
1144 	knp++->value.ui32 = srp->sr_kstat.srk_desballoc_fail;
1145 	knp++->value.ui32 = srp->sr_kstat.srk_rxq_empty_discard;
1146 
1147 done:
1148 	return (0);
1149 
1150 fail1:
1151 	DTRACE_PROBE1(fail1, int, rc);
1152 
1153 	return (rc);
1154 }
1155 
1156 static int
1157 sfxge_rx_kstat_init(sfxge_rxq_t *srp)
1158 {
1159 	sfxge_t *sp = srp->sr_sp;
1160 	unsigned int index = srp->sr_index;
1161 	sfxge_evq_t *sep = sp->s_sep[index];
1162 	dev_info_t *dip = sp->s_dip;
1163 	char name[MAXNAMELEN];
1164 	kstat_t *ksp;
1165 	kstat_named_t *knp;
1166 	int rc;
1167 
1168 	/* Create the set */
1169 	(void) snprintf(name, MAXNAMELEN - 1, "%s_rxq%04d",
1170 	    ddi_driver_name(dip), index);
1171 
1172 	if ((ksp = kstat_create((char *)ddi_driver_name(dip),
1173 	    ddi_get_instance(dip), name, "rxq", KSTAT_TYPE_NAMED,
1174 	    SFXGE_RX_NSTATS, 0)) == NULL) {
1175 		rc = ENOMEM;
1176 		goto fail1;
1177 	}
1178 
1179 	srp->sr_ksp = ksp;
1180 
1181 	ksp->ks_update = sfxge_rx_kstat_update;
1182 	ksp->ks_private = srp;
1183 	ksp->ks_lock = &(sep->se_lock);
1184 
1185 	/* Initialise the named stats */
1186 	knp = ksp->ks_data;
1187 	kstat_named_init(knp, "rx_pkt_mem_limit", KSTAT_DATA_UINT32);
1188 	knp++;
1189 	kstat_named_init(knp, "kcache_alloc_nomem", KSTAT_DATA_UINT32);
1190 	knp++;
1191 	kstat_named_init(knp, "dma_alloc_nomem", KSTAT_DATA_UINT32);
1192 	knp++;
1193 	kstat_named_init(knp, "dma_alloc_fail", KSTAT_DATA_UINT32);
1194 	knp++;
1195 	kstat_named_init(knp, "dma_bind_nomem", KSTAT_DATA_UINT32);
1196 	knp++;
1197 	kstat_named_init(knp, "dma_bind_fail", KSTAT_DATA_UINT32);
1198 	knp++;
1199 	kstat_named_init(knp, "desballoc_fail", KSTAT_DATA_UINT32);
1200 	knp++;
1201 	kstat_named_init(knp, "rxq_empty_discard", KSTAT_DATA_UINT32);
1202 
1203 	kstat_install(ksp);
1204 	return (0);
1205 
1206 fail1:
1207 	DTRACE_PROBE1(fail1, int, rc);
1208 
1209 	return (rc);
1210 }
1211 
1212 static int
1213 sfxge_rx_qinit(sfxge_t *sp, unsigned int index)
1214 {
1215 	sfxge_rxq_t *srp;
1216 	int rc;
1217 
1218 	ASSERT3U(index, <, SFXGE_RX_SCALE_MAX);
1219 
1220 	if ((srp = kmem_cache_alloc(sp->s_rqc, KM_SLEEP)) == NULL) {
1221 		rc = ENOMEM;
1222 		goto fail1;
1223 	}
1224 	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_UNINITIALIZED);
1225 
1226 	srp->sr_index = index;
1227 	sp->s_srp[index] = srp;
1228 
1229 	if ((rc = sfxge_rx_kstat_init(srp)) != 0)
1230 		goto fail2;
1231 
1232 	srp->sr_state = SFXGE_RXQ_INITIALIZED;
1233 
1234 	return (0);
1235 
1236 fail2:
1237 	DTRACE_PROBE(fail2);
1238 	kmem_cache_free(sp->s_rqc, srp);
1239 
1240 fail1:
1241 	DTRACE_PROBE1(fail1, int, rc);
1242 
1243 	return (rc);
1244 }
1245 
1246 static int
1247 sfxge_rx_qstart(sfxge_t *sp, unsigned int index)
1248 {
1249 	sfxge_evq_t *sep = sp->s_sep[index];
1250 	sfxge_rxq_t *srp;
1251 	efsys_mem_t *esmp;
1252 	efx_nic_t *enp;
1253 	unsigned int level;
1254 	int rc;
1255 
1256 	mutex_enter(&(sep->se_lock));
1257 	srp = sp->s_srp[index];
1258 	enp = sp->s_enp;
1259 	esmp = &(srp->sr_mem);
1260 
1261 	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_INITIALIZED);
1262 	ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);
1263 
1264 	/* Zero the memory */
1265 	bzero(esmp->esm_base, EFX_RXQ_SIZE(sp->s_rxq_size));
1266 
1267 	/* Program the buffer table */
1268 	if ((rc = sfxge_sram_buf_tbl_set(sp, srp->sr_id, esmp,
1269 	    EFX_RXQ_NBUFS(sp->s_rxq_size))) != 0)
1270 		goto fail1;
1271 
1272 	/* Create the receive queue */
1273 	if ((rc = efx_rx_qcreate(enp, index, index, EFX_RXQ_TYPE_DEFAULT,
1274 	    esmp, sp->s_rxq_size, srp->sr_id, sep->se_eep, &(srp->sr_erp)))
1275 	    != 0)
1276 		goto fail2;
1277 
1278 	/* Enable the receive queue */
1279 	efx_rx_qenable(srp->sr_erp);
1280 
1281 	/* Set the water marks */
1282 	srp->sr_hiwat = EFX_RXQ_LIMIT(sp->s_rxq_size) * 9 / 10;
1283 	srp->sr_lowat = srp->sr_hiwat / 2;
1284 
1285 	srp->sr_state = SFXGE_RXQ_STARTED;
1286 	srp->sr_flush = SFXGE_FLUSH_INACTIVE;
1287 
1288 	sfxge_rx_qpoll_start(srp);
1289 
1290 	/* Try to fill the queue from the pool */
1291 	sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
1292 
1293 	/*
1294 	 * If there were insufficient buffers in the pool to reach the at
1295 	 * least a batch then allocate some.
1296 	 */
1297 	level = srp->sr_added - srp->sr_completed;
1298 	if (level < SFXGE_RX_BATCH)
1299 		sfxge_rx_qfill(srp, SFXGE_RX_BATCH);
1300 
1301 	mutex_exit(&(sep->se_lock));
1302 
1303 	return (0);
1304 
1305 fail2:
1306 	DTRACE_PROBE(fail2);
1307 
1308 	/* Clear entries from the buffer table */
1309 	sfxge_sram_buf_tbl_clear(sp, srp->sr_id,
1310 	    EFX_RXQ_NBUFS(sp->s_rxq_size));
1311 
1312 fail1:
1313 	DTRACE_PROBE1(fail1, int, rc);
1314 
1315 	mutex_exit(&(sep->se_lock));
1316 
1317 	return (rc);
1318 }
1319 
1320 static void
1321 sfxge_rx_qflow_complete(sfxge_rxq_t *srp, sfxge_rx_flow_t *srfp)
1322 {
1323 	mblk_t *mp;
1324 	struct ether_header *etherhp;
1325 	struct ip *iphp;
1326 	struct tcphdr *thp;
1327 
1328 	if (srfp->srf_mp == NULL)
1329 		return;
1330 
1331 	mp = srfp->srf_mp;
1332 	etherhp = srfp->srf_etherhp;
1333 	iphp = srfp->srf_iphp;
1334 	thp = srfp->srf_last_thp;
1335 
1336 	ASSERT3U(((etherhp->ether_type == htons(ETHERTYPE_VLAN)) ?
1337 	    sizeof (struct ether_vlan_header) :
1338 	    sizeof (struct ether_header)) +
1339 	    srfp->srf_len, ==, msgdsize(mp));
1340 
1341 	ASSERT3U(srfp->srf_len & 0xffff, ==, srfp->srf_len);
1342 	iphp->ip_len = htons(srfp->srf_len);
1343 
1344 	srfp->srf_first_thp->th_ack = thp->th_ack;
1345 	srfp->srf_first_thp->th_win = thp->th_win;
1346 	srfp->srf_first_thp->th_flags = thp->th_flags;
1347 
1348 	DTRACE_PROBE2(flow_complete, uint32_t, srfp->srf_tag,
1349 	    size_t, srfp->srf_len);
1350 
1351 	srfp->srf_mp = NULL;
1352 	srfp->srf_len = 0;
1353 
1354 	ASSERT(mp->b_next == NULL);
1355 	*(srp->sr_mpp) = mp;
1356 	srp->sr_mpp = &(mp->b_next);
1357 }
1358 
1359 static boolean_t
1360 sfxge_rx_qflow_add(sfxge_rxq_t *srp, sfxge_rx_flow_t *srfp,
1361     sfxge_rx_packet_t *srpp, clock_t now)
1362 {
1363 	sfxge_t *sp = srp->sr_sp;
1364 	struct ether_header *etherhp = srpp->srp_etherhp;
1365 	struct ip *iphp = srpp->srp_iphp;
1366 	struct tcphdr *thp = srpp->srp_thp;
1367 	size_t off = srpp->srp_off;
1368 	size_t size = (size_t)(srpp->srp_size);
1369 	mblk_t *mp = srpp->srp_mp;
1370 	uint32_t seq;
1371 	unsigned int shift;
1372 
1373 	ASSERT3U(MBLKL(mp), ==, off + size);
1374 	ASSERT3U(DB_CKSUMFLAGS(mp), ==,
1375 	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK | HCK_IPV4_HDRCKSUM);
1376 
1377 	seq = htonl(thp->th_seq);
1378 
1379 	/*
1380 	 * If the time between this segment and the last is greater than RTO
1381 	 * then consider this a new flow.
1382 	 */
1383 	if (now - srfp->srf_lbolt > srp->sr_rto) {
1384 		srfp->srf_count = 1;
1385 		srfp->srf_seq = seq + size;
1386 
1387 		goto fail1;
1388 	}
1389 
1390 	if (seq != srfp->srf_seq) {
1391 		if (srfp->srf_count > SFXGE_SLOW_START)
1392 			srfp->srf_count = SFXGE_SLOW_START;
1393 
1394 		srfp->srf_count >>= 1;
1395 
1396 		srfp->srf_count++;
1397 		srfp->srf_seq = seq + size;
1398 
1399 		goto fail2;
1400 	}
1401 
1402 	/* Update the in-order segment count and sequence number */
1403 	srfp->srf_count++;
1404 	srfp->srf_seq = seq + size;
1405 
1406 	/* Don't merge across pure ACK, URG, SYN or RST segments */
1407 	if (size == 0 || thp->th_flags & (TH_URG | TH_SYN | TH_RST) ||
1408 	    thp->th_urp != 0)
1409 		goto fail3;
1410 
1411 	/*
1412 	 * If the in-order segment count has not yet reached the slow-start
1413 	 * threshold then we cannot coalesce.
1414 	 */
1415 	if (srfp->srf_count < SFXGE_SLOW_START)
1416 		goto fail4;
1417 
1418 	/* Scale up the packet size from 4k (the maximum being 64k) */
1419 	ASSERT3U(srfp->srf_count, >=, SFXGE_SLOW_START);
1420 	shift = MIN(srfp->srf_count - SFXGE_SLOW_START + 12, 16);
1421 	if (srfp->srf_len + size >= (1 << shift))
1422 		sfxge_rx_qflow_complete(srp, srfp);
1423 
1424 	ASSERT(mp->b_cont == NULL);
1425 
1426 	if (srfp->srf_mp == NULL) {
1427 		/* First packet in this flow */
1428 		srfp->srf_etherhp = etherhp;
1429 		srfp->srf_iphp = iphp;
1430 		srfp->srf_first_thp = srfp->srf_last_thp = thp;
1431 
1432 		ASSERT3P(mp->b_cont, ==, NULL);
1433 		srfp->srf_mp = mp;
1434 		srfp->srf_mpp = &(mp->b_cont);
1435 
1436 		srfp->srf_len = ntohs(iphp->ip_len);
1437 
1438 		/*
1439 		 * If the flow is not already in the list of occupied flows then
1440 		 * add it.
1441 		 */
1442 		if (srfp->srf_next == NULL &&
1443 		    srp->sr_srfpp != &(srfp->srf_next)) {
1444 			*(srp->sr_srfpp) = srfp;
1445 			srp->sr_srfpp = &(srfp->srf_next);
1446 		}
1447 	} else {
1448 		/* Later packet in this flow - skip TCP header */
1449 		srfp->srf_last_thp = thp;
1450 
1451 		mp->b_rptr += off;
1452 		ASSERT3U(MBLKL(mp), ==, size);
1453 
1454 		ASSERT3P(mp->b_cont, ==, NULL);
1455 		*(srfp->srf_mpp) = mp;
1456 		srfp->srf_mpp = &(mp->b_cont);
1457 
1458 		srfp->srf_len += size;
1459 
1460 		ASSERT(srfp->srf_next != NULL ||
1461 		    srp->sr_srfpp == &(srfp->srf_next));
1462 	}
1463 
1464 	DTRACE_PROBE2(flow_add, uint32_t, srfp->srf_tag, size_t, size);
1465 
1466 	/*
1467 	 * Try to align coalesced segments on push boundaries, unless they
1468 	 * are too frequent.
1469 	 */
1470 	if (sp->s_rx_coalesce_mode == SFXGE_RX_COALESCE_ALLOW_PUSH &&
1471 	    thp->th_flags & TH_PUSH)
1472 		sfxge_rx_qflow_complete(srp, srfp);
1473 
1474 	srfp->srf_lbolt = now;
1475 	return (B_TRUE);
1476 
1477 fail4:
1478 fail3:
1479 fail2:
1480 fail1:
1481 	sfxge_rx_qflow_complete(srp, srfp);
1482 
1483 	srfp->srf_lbolt = now;
1484 	return (B_FALSE);
1485 }
1486 
/*
 * Attempt to coalesce the TCP segments on the receive queue's packet chain.
 *
 * The chain on srp->sr_mp is detached and walked one message at a time.
 * Each message is either handed to sfxge_rx_qflow_add(), which may take
 * it into a flow, or is put back on srp->sr_mp unchanged (the "reject"
 * label). Rejected messages are re-queued in the order they are visited.
 *
 * Must only be called while coalescing is enabled (assertion only).
 */
1487 void
1488 sfxge_rx_qpacket_coalesce(sfxge_rxq_t *srp)
1489 {
1490 	sfxge_t *sp = srp->sr_sp;
1491 	clock_t now;
1492 	mblk_t *mp;
1493 	sfxge_rx_flow_t *srfp;
1494 
1495 	ASSERT(sp->s_rx_coalesce_mode != SFXGE_RX_COALESCE_OFF);
1496 
1497 	now = ddi_get_lbolt();
1498 
	/* Detach the current chain; the loop below rebuilds srp->sr_mp */
1499 	mp = srp->sr_mp;
1500 
1501 	srp->sr_mp = NULL;
1502 	srp->sr_mpp = &(srp->sr_mp);
1503 
1504 	/* Start with the last flow to be appended to */
	/*
	 * NOTE(review): sfxge_rx_qflow_add() stores through sr_srfpp and then
	 * advances it to the new entry's srf_next, so this read looks like it
	 * yields NULL whenever the list is well formed - confirm.
	 */
1505 	srfp = *(srp->sr_srfpp);
1506 
1507 	while (mp != NULL) {
1508 		frtn_t *freep;
1509 		sfxge_rx_packet_t *srpp;
1510 		struct ether_header *etherhp;
1511 		struct ip *iphp;
1512 		struct tcphdr *thp;
1513 		size_t off;
1514 		size_t size;
1515 		uint16_t ether_tci;
1516 		uint32_t hash;
1517 		uint32_t tag;
1518 		mblk_t *next;
1519 		sfxge_packet_type_t pkt_type;
1520 		uint16_t sport, dport;
1521 
		/* Unlink this message from the chain before working on it */
1522 		next = mp->b_next;
1523 		mp->b_next = NULL;
1524 
1525 		if (next != NULL)
1526 			prefetch_read_many(next);
1527 
		/* The driver's packet state is the free routine's argument */
1528 		freep = DB_FRTNP(mp);
1529 		/*LINTED*/
1530 		srpp = (sfxge_rx_packet_t *)(freep->free_arg);
1531 		ASSERT3P(srpp->srp_mp, ==, mp);
1532 
1533 		/* If the packet is not TCP then we cannot coalesce it */
1534 		if (~(srpp->srp_flags) & EFX_PKT_TCP)
1535 			goto reject;
1536 
1537 		/*
1538 		 * If the packet is not fully checksummed then we cannot
1539 		 * coalesce it.
1540 		 */
		/* Both the TCP/UDP and the IPv4 checksum flags must be set */
1541 		if (~(srpp->srp_flags) & (EFX_CKSUM_TCPUDP | EFX_CKSUM_IPV4))
1542 			goto reject;
1543 
1544 		/* Parse the TCP header */
1545 		pkt_type = sfxge_pkthdr_parse(mp, &etherhp, &iphp, &thp, &off,
1546 		    &size, &sport, &dport);
1547 		ASSERT(pkt_type == SFXGE_PACKET_TYPE_IPV4_TCP);
1548 		ASSERT(etherhp != NULL);
1549 		ASSERT(iphp != NULL);
1550 		ASSERT(thp != NULL);
1551 		ASSERT(off != 0);
1552 
		/* Reject if any bit of ip_off other than IP_DF is set */
1553 		if ((iphp->ip_off & ~htons(IP_DF)) != 0)
1554 			goto reject;
1555 
		/* Use the VLAN TCI as part of the flow identity (0 if untagged) */
1556 		if (etherhp->ether_type == htons(ETHERTYPE_VLAN)) {
1557 			struct ether_vlan_header *ethervhp;
1558 
1559 			ethervhp = (struct ether_vlan_header *)etherhp;
1560 			ether_tci = ethervhp->ether_tci;
1561 		} else {
1562 			ether_tci = 0;
1563 		}
1564 
1565 		/*
1566 		 * Make sure any minimum length padding is stripped
1567 		 * before we try to add the packet to a flow.
1568 		 */
1569 		ASSERT3U(sp->s_rx_prefix_size + MBLKL(mp), ==,
1570 		    (size_t)(srpp->srp_size));
1571 		ASSERT3U(sp->s_rx_prefix_size + off + size, <=,
1572 		    (size_t)(srpp->srp_size));
1573 
1574 		if (sp->s_rx_prefix_size + off + size <
1575 		    (size_t)(srpp->srp_size))
1576 			mp->b_wptr = mp->b_rptr + off + size;
1577 
1578 		/*
1579 		 * If there is no current flow, or the segment does not match
1580 		 * the current flow then we must attempt to look up the
1581 		 * correct flow in the table.
1582 		 */
1583 		if (srfp == NULL)
1584 			goto lookup;
1585 
1586 		if (srfp->srf_saddr != iphp->ip_src.s_addr ||
1587 		    srfp->srf_daddr != iphp->ip_dst.s_addr)
1588 			goto lookup;
1589 
1590 		if (srfp->srf_sport != thp->th_sport ||
1591 		    srfp->srf_dport != thp->th_dport)
1592 			goto lookup;
1593 
1594 		if (srfp->srf_tci != ether_tci)
1595 			goto lookup;
1596 
1597 add:
1598 		ASSERT(srfp != NULL);
1599 
		/* Record the parse results for sfxge_rx_qflow_add() */
1600 		srpp->srp_etherhp = etherhp;
1601 		srpp->srp_iphp = iphp;
1602 		srpp->srp_thp = thp;
1603 		srpp->srp_off = off;
1604 
		/* From here on srp_size holds the value parsed into 'size' */
1605 		ASSERT3U(size, <, (1 << 16));
1606 		srpp->srp_size = (uint16_t)size;
1607 
1608 		/* Try to append the packet to the flow */
1609 		if (!sfxge_rx_qflow_add(srp, srfp, srpp, now))
1610 			goto reject;
1611 
1612 		mp = next;
1613 		continue;
1614 
1615 lookup:
1616 		/*
1617 		 * If there is a prefix area then read the hash from that,
1618 		 * otherwise calculate it.
1619 		 */
1620 		if (sp->s_rx_prefix_size != 0) {
1621 			hash = efx_psuedo_hdr_hash_get(sp->s_enp,
1622 			    EFX_RX_HASHALG_TOEPLITZ,
1623 			    DB_BASE(mp));
1624 		} else {
1625 			SFXGE_TCP_HASH(sp,
1626 			    &iphp->ip_src.s_addr,
1627 			    thp->th_sport,
1628 			    &iphp->ip_dst.s_addr,
1629 			    thp->th_dport,
1630 			    hash);
1631 		}
1632 
		/* The table slot comes from the hash with its low 6 bits dropped */
1633 		srfp = &(srp->sr_flow[(hash >> 6) % SFXGE_MAX_FLOW]);
1634 		tag = hash + 1; /* Make sure it's not zero */
1635 
1636 		/*
1637 		 * If the flow we have found does not match the hash then
1638 		 * it may be an unused flow, or it may be stale.
1639 		 */
1640 		if (tag != srfp->srf_tag) {
			/* A counted flow seen within the last RTO is still live */
1641 			if (srfp->srf_count != 0) {
1642 				if (now - srfp->srf_lbolt <= srp->sr_rto)
1643 					goto reject;
1644 			}
1645 
			/* Never take over a slot that still holds a message */
1646 			if (srfp->srf_mp != NULL)
1647 				goto reject;
1648 
1649 			/* Start a new flow */
1650 			ASSERT(srfp->srf_next == NULL);
1651 
1652 			srfp->srf_tag = tag;
1653 
1654 			srfp->srf_saddr = iphp->ip_src.s_addr;
1655 			srfp->srf_daddr = iphp->ip_dst.s_addr;
1656 			srfp->srf_sport = thp->th_sport;
1657 			srfp->srf_dport = thp->th_dport;
1658 			srfp->srf_tci = ether_tci;
1659 
1660 			srfp->srf_count = 0;
1661 			srfp->srf_seq = ntohl(thp->th_seq);
1662 
1663 			srfp->srf_lbolt = now;
1664 			goto add;
1665 		}
1666 
1667 		/*
1668 		 * If the flow we have found does match the hash then it could
1669 		 * still be an alias.
1670 		 */
1671 		if (srfp->srf_saddr != iphp->ip_src.s_addr ||
1672 		    srfp->srf_daddr != iphp->ip_dst.s_addr)
1673 			goto reject;
1674 
1675 		if (srfp->srf_sport != thp->th_sport ||
1676 		    srfp->srf_dport != thp->th_dport)
1677 			goto reject;
1678 
1679 		if (srfp->srf_tci != ether_tci)
1680 			goto reject;
1681 
1682 		goto add;
1683 
1684 reject:
		/* Not coalesced: put the message back on the packet chain */
1685 		*(srp->sr_mpp) = mp;
1686 		srp->sr_mpp = &(mp->b_next);
1687 
1688 		mp = next;
1689 	}
1690 }
1691 
/*
 * Process the receive descriptors between sr_completed and sr_pending.
 *
 * srp - the receive queue.
 * eop - B_TRUE at the end of a poll; any flows still being coalesced are
 *       then completed before packets are delivered.
 *
 * Each packet is either discarded with freeb() or appended to the chain
 * on srp->sr_mp. The chain is then optionally coalesced, passed to
 * sfxge_gld_rx_post() and the queue is topped up with buffers.
 *
 * The caller must hold the lock of the event queue that shares this
 * receive queue's index, and the packet chain must be empty on entry
 * (assertions only).
 */
1692 void
1693 sfxge_rx_qcomplete(sfxge_rxq_t *srp, boolean_t eop)
1694 {
1695 	sfxge_t *sp = srp->sr_sp;
1696 	unsigned int index = srp->sr_index;
1697 	sfxge_evq_t *sep = sp->s_sep[index];
1698 	unsigned int completed;
1699 	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
1700 	unsigned int level;
1701 
1702 	ASSERT(mutex_owned(&(sep->se_lock)));
1703 
1704 	ASSERT(srp->sr_mp == NULL);
1705 	ASSERT(srp->sr_mpp == &(srp->sr_mp));
1706 
1707 	completed = srp->sr_completed;
1708 	while (completed != srp->sr_pending) {
1709 		unsigned int id;
1710 		sfxge_rx_packet_t *srpp;
1711 		mblk_t *mp;
1712 		size_t size;
1713 		uint16_t flags;
1714 		int rc;
1715 
		/* Ring slot of this descriptor; the mask wraps the counter */
1716 		id = completed++ & (sp->s_rxq_size - 1);
1717 
		/* With at least 4 more to go, prefetch the packet 4 slots on */
1718 		if (srp->sr_pending - completed >= 4) {
1719 			unsigned int prefetch;
1720 
1721 			prefetch = (id + 4) & (sp->s_rxq_size - 1);
1722 
1723 			srpp = srp->sr_srpp[prefetch];
1724 			ASSERT(srpp != NULL);
1725 
1726 			mp = srpp->srp_mp;
1727 			prefetch_read_many(mp->b_datap);
1728 		} else if (completed == srp->sr_pending) {
1729 			prefetch_read_many(srp->sr_mp);
1730 		}
1731 
		/* Take the packet out of the ring */
1732 		srpp = srp->sr_srpp[id];
1733 		ASSERT(srpp != NULL);
1734 
1735 		srp->sr_srpp[id] = NULL;
1736 
1737 		mp = srpp->srp_mp;
1738 		ASSERT(mp->b_cont == NULL);
1739 
1740 		/* when called from sfxge_rx_qstop() */
1741 		if (srp->sr_state != SFXGE_RXQ_STARTED)
1742 			goto discard;
1743 
1744 		if (srpp->srp_flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
1745 			goto discard;
1746 
1747 		/* Make the data visible to the kernel */
1748 		rc = ddi_dma_sync(srpp->srp_dma_handle, 0,
1749 		    sp->s_rx_buffer_size, DDI_DMA_SYNC_FORKERNEL);
1750 		ASSERT3P(rc, ==, DDI_SUCCESS);
1751 
1752 		/* Read the length from the pseudo header if required */
1753 		if (srpp->srp_flags & EFX_PKT_PREFIX_LEN) {
1754 			rc = efx_psuedo_hdr_pkt_length_get(sp->s_enp,
1755 			    mp->b_rptr,
1756 			    &srpp->srp_size);
1757 			ASSERT3P(rc, ==, 0);
			/* srp_size is kept inclusive of the prefix */
1758 			srpp->srp_size += sp->s_rx_prefix_size;
1759 		}
1760 
1761 		/* Set up the packet length */
		/* b_rptr skips the prefix; b_wptr spans prefix plus packet */
1762 		ASSERT3P(mp->b_rptr, ==, DB_BASE(mp));
1763 		mp->b_rptr += sp->s_rx_prefix_size;
1764 
1765 		prefetch_read_many(mp->b_rptr);
1766 
1767 		ASSERT3P(mp->b_wptr, ==, DB_BASE(mp));
1768 		mp->b_wptr += (size_t)(srpp->srp_size);
1769 		ASSERT3P(mp->b_wptr, <=, DB_LIM(mp));
1770 
1771 		/* Calculate the maximum packet size */
1772 		size = sp->s_mtu;
1773 		size += (srpp->srp_flags & EFX_PKT_VLAN_TAGGED) ?
1774 		    sizeof (struct ether_vlan_header) :
1775 		    sizeof (struct ether_header);
1776 
		/* Drop anything longer than the MTU plus the link header */
1777 		if (MBLKL(mp) > size)
1778 			goto discard;
1779 
1780 		/* Check for loopback packets */
		/* Only packets flagged as neither IPv4 nor IPv6 are examined */
1781 		if (!(srpp->srp_flags & EFX_PKT_IPV4) &&
1782 		    !(srpp->srp_flags & EFX_PKT_IPV6)) {
1783 			struct ether_header *etherhp;
1784 
1785 			/*LINTED*/
1786 			etherhp = (struct ether_header *)(mp->b_rptr);
1787 
1788 			if (etherhp->ether_type ==
1789 			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
1790 				DTRACE_PROBE(loopback);
1791 
1792 				srp->sr_loopback++;
1793 				goto discard;
1794 			}
1795 		}
1796 
1797 		/* Set up the checksum information */
1798 		flags = 0;
1799 
1800 		if (srpp->srp_flags & EFX_CKSUM_IPV4) {
1801 			ASSERT(srpp->srp_flags & EFX_PKT_IPV4);
1802 			flags |= HCK_IPV4_HDRCKSUM;
1803 		}
1804 
1805 		if (srpp->srp_flags & EFX_CKSUM_TCPUDP) {
1806 			ASSERT(srpp->srp_flags & EFX_PKT_TCP ||
1807 			    srpp->srp_flags & EFX_PKT_UDP);
1808 			flags |= HCK_FULLCKSUM | HCK_FULLCKSUM_OK;
1809 		}
1810 
1811 		DB_CKSUMSTART(mp) = 0;
1812 		DB_CKSUMSTUFF(mp) = 0;
1813 		DB_CKSUMEND(mp) = 0;
1814 		DB_CKSUMFLAGS(mp) = flags;
1815 		DB_CKSUM16(mp) = 0;
1816 
1817 		/* Add the packet to the tail of the chain */
1818 		srfppp->srfpp_loaned++;
1819 
1820 		ASSERT(mp->b_next == NULL);
1821 		*(srp->sr_mpp) = mp;
1822 		srp->sr_mpp = &(mp->b_next);
1823 
1824 		continue;
1825 
1826 discard:
1827 		/* Return the packet to the pool */
		/*
		 * NOTE(review): the loan count is raised before freeb();
		 * presumably the mblk's free routine lowers it again when it
		 * recycles the packet - confirm against that routine.
		 */
1828 		srfppp->srfpp_loaned++;
1829 		freeb(mp); /* Equivalent to freemsg() as b_cont==0 */
1830 	}
1831 	srp->sr_completed = completed;
1832 
1833 	/* Attempt to coalesce any TCP packets */
1834 	if (sp->s_rx_coalesce_mode != SFXGE_RX_COALESCE_OFF)
1835 		sfxge_rx_qpacket_coalesce(srp);
1836 
1837 	/*
1838 	 * If there are any pending flows and this is the end of the
1839 	 * poll then they must be completed.
1840 	 */
1841 	if (srp->sr_srfp != NULL && eop) {
1842 		sfxge_rx_flow_t *srfp;
1843 
1844 		srfp = srp->sr_srfp;
1845 
		/* Empty the list of occupied flows before walking it */
1846 		srp->sr_srfp = NULL;
1847 		srp->sr_srfpp = &(srp->sr_srfp);
1848 
1849 		do {
1850 			sfxge_rx_flow_t *next;
1851 
1852 			next = srfp->srf_next;
1853 			srfp->srf_next = NULL;
1854 
1855 			sfxge_rx_qflow_complete(srp, srfp);
1856 
1857 			srfp = next;
1858 		} while (srfp != NULL);
1859 	}
1860 
	/* Descriptors pushed to the hardware but not yet completed */
1861 	level = srp->sr_pushed - srp->sr_completed;
1862 
1863 	/* If there are any packets then pass them up the stack */
1864 	if (srp->sr_mp != NULL) {
1865 		mblk_t *mp;
1866 
1867 		mp = srp->sr_mp;
1868 
1869 		srp->sr_mp = NULL;
1870 		srp->sr_mpp = &(srp->sr_mp);
1871 
1872 		if (level == 0) {
1873 			/* Try to refill ASAP */
1874 			sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
1875 			level = srp->sr_pushed - srp->sr_completed;
1876 		}
1877 
1878 		/*
1879 		 * If the RXQ is still empty, discard and recycle the
1880 		 * current entry to ensure that the ring always
1881 		 * contains at least one descriptor. This ensures that
1882 		 * the next hardware RX will trigger an event
1883 		 * (possibly delayed by interrupt moderation) and
1884 		 * trigger another refill/fill attempt.
1885 		 *
1886 		 * Note this drops a complete LRO fragment from the
1887 		 * start of the batch.
1888 		 *
1889 		 * Note also that copymsgchain() does not help with
1890 		 * resource starvation here, unless we are short of DMA
1891 		 * mappings.
1892 		 */
1893 		if (level == 0) {
1894 			mblk_t *nmp;
1895 
1896 			srp->sr_kstat.srk_rxq_empty_discard++;
1897 			DTRACE_PROBE1(rxq_empty_discard, int, index);
			/* Deliver everything after the head, then free the head */
1898 			nmp = mp->b_next;
1899 			if (nmp)
1900 				sfxge_gld_rx_post(sp, index, nmp);
1901 			/* as level==0 will swizzle,rxpost below */
1902 			freemsg(mp);
1903 		} else {
1904 			sfxge_gld_rx_post(sp, index, mp);
1905 		}
1906 	}
1907 
1908 	/* Top up the queue if necessary */
	/* Refill from the pool first; allocate only below the low mark */
1909 	if (level < srp->sr_hiwat) {
1910 		sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
1911 
1912 		level = srp->sr_added - srp->sr_completed;
1913 		if (level < srp->sr_lowat)
1914 			sfxge_rx_qfill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
1915 	}
1916 }
1917 
1918 void
1919 sfxge_rx_qflush_done(sfxge_rxq_t *srp)
1920 {
1921 	sfxge_t *sp = srp->sr_sp;
1922 	unsigned int index = srp->sr_index;
1923 	sfxge_evq_t *sep = sp->s_sep[index];
1924 	boolean_t flush_pending;
1925 
1926 	ASSERT(mutex_owned(&(sep->se_lock)));
1927 
1928 	/*
1929 	 * Flush successful: wakeup sfxge_rx_qstop() if flush is pending.
1930 	 *
1931 	 * A delayed flush event received after RxQ stop has timed out
1932 	 * will be ignored, as then the flush state will not be PENDING
1933 	 * (see SFCbug22989).
1934 	 */
1935 	flush_pending = (srp->sr_flush == SFXGE_FLUSH_PENDING);
1936 	srp->sr_flush = SFXGE_FLUSH_DONE;
1937 	if (flush_pending)
1938 		cv_broadcast(&(srp->sr_flush_kv));
1939 }
1940 
1941 void
1942 sfxge_rx_qflush_failed(sfxge_rxq_t *srp)
1943 {
1944 	sfxge_t *sp = srp->sr_sp;
1945 	unsigned int index = srp->sr_index;
1946 	sfxge_evq_t *sep = sp->s_sep[index];
1947 	boolean_t flush_pending;
1948 
1949 	ASSERT(mutex_owned(&(sep->se_lock)));
1950 
1951 	/*
1952 	 * Flush failed: wakeup sfxge_rx_qstop() if flush is pending.
1953 	 *
1954 	 * A delayed flush event received after RxQ stop has timed out
1955 	 * will be ignored, as then the flush state will not be PENDING
1956 	 * (see SFCbug22989).
1957 	 */
1958 	flush_pending = (srp->sr_flush == SFXGE_FLUSH_PENDING);
1959 	srp->sr_flush = SFXGE_FLUSH_FAILED;
1960 	if (flush_pending)
1961 		cv_broadcast(&(srp->sr_flush_kv));
1962 }
1963 
1964 static void
1965 sfxge_rx_qstop(sfxge_t *sp, unsigned int index)
1966 {
1967 	dev_info_t *dip = sp->s_dip;
1968 	sfxge_evq_t *sep = sp->s_sep[index];
1969 	sfxge_rxq_t *srp;
1970 	clock_t timeout;
1971 	unsigned int flush_tries = SFXGE_RX_QFLUSH_TRIES;
1972 	int rc;
1973 
1974 	ASSERT(mutex_owned(&(sp->s_state_lock)));
1975 
1976 	mutex_enter(&(sep->se_lock));
1977 
1978 	srp = sp->s_srp[index];
1979 	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);
1980 
1981 	sfxge_rx_qpoll_stop(srp);
1982 
1983 	/* Further packets are discarded by sfxge_rx_qcomplete() */
1984 	srp->sr_state = SFXGE_RXQ_INITIALIZED;
1985 
1986 	if (sp->s_hw_err != SFXGE_HW_OK) {
1987 		/*
1988 		 * Flag indicates possible hardware failure.
1989 		 * Attempt flush but do not wait for it to complete.
1990 		 */
1991 		srp->sr_flush = SFXGE_FLUSH_DONE;
1992 		(void) efx_rx_qflush(srp->sr_erp);
1993 	}
1994 
1995 	/* Wait upto 2sec for queue flushing to complete */
1996 	timeout = ddi_get_lbolt() + drv_usectohz(SFXGE_RX_QFLUSH_USEC);
1997 
1998 	while (srp->sr_flush != SFXGE_FLUSH_DONE && flush_tries-- > 0) {
1999 		if ((rc = efx_rx_qflush(srp->sr_erp)) != 0) {
2000 			if (rc == EALREADY)
2001 				srp->sr_flush = SFXGE_FLUSH_DONE;
2002 			else
2003 				srp->sr_flush = SFXGE_FLUSH_FAILED;
2004 			break;
2005 		}
2006 		srp->sr_flush = SFXGE_FLUSH_PENDING;
2007 		if (cv_timedwait(&(srp->sr_flush_kv), &(sep->se_lock),
2008 		    timeout) < 0) {
2009 			/* Timeout waiting for successful or failed flush */
2010 			dev_err(dip, CE_NOTE,
2011 			    SFXGE_CMN_ERR "rxq[%d] flush timeout", index);
2012 			break;
2013 		}
2014 	}
2015 
2016 	if (srp->sr_flush == SFXGE_FLUSH_FAILED)
2017 		dev_err(dip, CE_NOTE,
2018 		    SFXGE_CMN_ERR "rxq[%d] flush failed", index);
2019 
2020 	DTRACE_PROBE1(flush, sfxge_flush_state_t, srp->sr_flush);
2021 	srp->sr_flush = SFXGE_FLUSH_DONE;
2022 
2023 	/* Destroy the receive queue */
2024 	efx_rx_qdestroy(srp->sr_erp);
2025 	srp->sr_erp = NULL;
2026 
2027 	/* Clear entries from the buffer table */
2028 	sfxge_sram_buf_tbl_clear(sp, srp->sr_id,
2029 	    EFX_RXQ_NBUFS(sp->s_rxq_size));
2030 
2031 	/*
2032 	 * Free any unused RX packets which had descriptors on the RXQ
2033 	 * Packets will be discard as state != STARTED
2034 	 */
2035 	srp->sr_pending = srp->sr_added;
2036 	sfxge_rx_qcomplete(srp, B_TRUE);
2037 
2038 	ASSERT3U(srp->sr_completed, ==, srp->sr_pending);
2039 
2040 	srp->sr_added = 0;
2041 	srp->sr_pushed = 0;
2042 	srp->sr_pending = 0;
2043 	srp->sr_completed = 0;
2044 	srp->sr_loopback = 0;
2045 
2046 	srp->sr_lowat = 0;
2047 	srp->sr_hiwat = 0;
2048 
2049 	mutex_exit(&(sep->se_lock));
2050 }
2051 
2052 static void
2053 sfxge_rx_kstat_fini(sfxge_rxq_t *srp)
2054 {
2055 	kstat_delete(srp->sr_ksp);
2056 	srp->sr_ksp = NULL;
2057 }
2058 
2059 static void
2060 sfxge_rx_qfini(sfxge_t *sp, unsigned int index)
2061 {
2062 	sfxge_rxq_t *srp = sp->s_srp[index];
2063 
2064 	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_INITIALIZED);
2065 
2066 	sp->s_srp[index] = NULL;
2067 	srp->sr_state = SFXGE_RXQ_UNINITIALIZED;
2068 
2069 	sfxge_rx_kstat_fini(srp);
2070 
2071 	/* Empty the pool */
2072 	sfxge_rx_qfpp_empty(srp);
2073 
2074 	srp->sr_index = 0;
2075 
2076 	kmem_cache_free(sp->s_rqc, srp);
2077 }
2078 
2079 static int
2080 sfxge_rx_scale_kstat_update(kstat_t *ksp, int rw)
2081 {
2082 	sfxge_t *sp = ksp->ks_private;
2083 	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2084 	sfxge_intr_t *sip = &(sp->s_intr);
2085 	kstat_named_t *knp;
2086 	unsigned int index;
2087 	unsigned int entry;
2088 	unsigned int *freq;
2089 	int rc;
2090 
2091 	ASSERT(mutex_owned(&(srsp->srs_lock)));
2092 
2093 	if (rw != KSTAT_READ) {
2094 		rc = EACCES;
2095 		goto fail1;
2096 	}
2097 
2098 	if ((freq = kmem_zalloc(sizeof (unsigned int) * sip->si_nalloc,
2099 	    KM_NOSLEEP)) == NULL) {
2100 		rc = ENOMEM;
2101 		goto fail2;
2102 	}
2103 
2104 	for (entry = 0; entry < SFXGE_RX_SCALE_MAX; entry++) {
2105 		index = srsp->srs_tbl[entry];
2106 
2107 		freq[index]++;
2108 	}
2109 
2110 	knp = ksp->ks_data;
2111 	for (index = 0; index < sip->si_nalloc; index++) {
2112 		knp->value.ui64 = freq[index];
2113 		knp++;
2114 	}
2115 
2116 	knp->value.ui64 = srsp->srs_count;
2117 
2118 	kmem_free(freq, sizeof (unsigned int) * sip->si_nalloc);
2119 
2120 	return (0);
2121 
2122 fail2:
2123 	DTRACE_PROBE(fail2);
2124 fail1:
2125 	DTRACE_PROBE1(fail1, int, rc);
2126 	return (rc);
2127 }
2128 
2129 static int
2130 sfxge_rx_scale_kstat_init(sfxge_t *sp)
2131 {
2132 	dev_info_t *dip = sp->s_dip;
2133 	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2134 	sfxge_intr_t *sip = &(sp->s_intr);
2135 	char name[MAXNAMELEN];
2136 	kstat_t *ksp;
2137 	kstat_named_t *knp;
2138 	unsigned int index;
2139 	int rc;
2140 
2141 	/* Create the set */
2142 	(void) snprintf(name, MAXNAMELEN - 1, "%s_rss", ddi_driver_name(dip));
2143 
2144 	if ((ksp = kstat_create((char *)ddi_driver_name(dip),
2145 	    ddi_get_instance(dip), name, "rss", KSTAT_TYPE_NAMED,
2146 	    sip->si_nalloc + 1, 0)) == NULL) {
2147 		rc = ENOMEM;
2148 		goto fail1;
2149 	}
2150 
2151 	srsp->srs_ksp = ksp;
2152 
2153 	ksp->ks_update = sfxge_rx_scale_kstat_update;
2154 	ksp->ks_private = sp;
2155 	ksp->ks_lock = &(srsp->srs_lock);
2156 
2157 	/* Initialise the named stats */
2158 	knp = ksp->ks_data;
2159 	for (index = 0; index < sip->si_nalloc; index++) {
2160 		char name[MAXNAMELEN];
2161 
2162 		(void) snprintf(name, MAXNAMELEN - 1, "evq%04d_count", index);
2163 		kstat_named_init(knp, name, KSTAT_DATA_UINT64);
2164 		knp++;
2165 	}
2166 
2167 	kstat_named_init(knp, "scale", KSTAT_DATA_UINT64);
2168 
2169 	kstat_install(ksp);
2170 	return (0);
2171 
2172 fail1:
2173 	DTRACE_PROBE1(fail1, int, rc);
2174 
2175 	return (rc);
2176 }
2177 
2178 static void
2179 sfxge_rx_scale_kstat_fini(sfxge_t *sp)
2180 {
2181 	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2182 
2183 	/* Destroy the set */
2184 	kstat_delete(srsp->srs_ksp);
2185 	srsp->srs_ksp = NULL;
2186 }
2187 
2188 
2189 unsigned int
2190 sfxge_rx_scale_prop_get(sfxge_t *sp)
2191 {
2192 	int rx_scale;
2193 
2194 	rx_scale = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
2195 	    DDI_PROP_DONTPASS, "rx_scale_count", SFXGE_RX_SCALE_MAX);
2196 	/* 0 and all -ve numbers sets to number of logical CPUs */
2197 	if (rx_scale <= 0)
2198 		rx_scale = ncpus;
2199 
2200 	return (rx_scale);
2201 }
2202 
2203 
2204 static int
2205 sfxge_rx_scale_init(sfxge_t *sp)
2206 {
2207 	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2208 	sfxge_intr_t *sip = &(sp->s_intr);
2209 	int rc;
2210 
2211 	ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_UNINITIALIZED);
2212 
2213 	/* Create tables for CPU, core, cache and chip counts */
2214 	srsp->srs_cpu = kmem_zalloc(sizeof (unsigned int) * NCPU, KM_SLEEP);
2215 
2216 	mutex_init(&(srsp->srs_lock), NULL, MUTEX_DRIVER, NULL);
2217 
2218 	/* We need at least one event queue */
2219 	srsp->srs_count = sfxge_rx_scale_prop_get(sp);
2220 	if (srsp->srs_count > sip->si_nalloc)
2221 		srsp->srs_count = sip->si_nalloc;
2222 	if (srsp->srs_count < 1)
2223 		srsp->srs_count = 1;
2224 
2225 	/* Set up the kstats */
2226 	if ((rc = sfxge_rx_scale_kstat_init(sp)) != 0)
2227 		goto fail1;
2228 
2229 	srsp->srs_state = SFXGE_RX_SCALE_INITIALIZED;
2230 
2231 	return (0);
2232 
2233 fail1:
2234 	DTRACE_PROBE1(fail1, int, rc);
2235 	mutex_destroy(&(srsp->srs_lock));
2236 
2237 	return (rc);
2238 }
2239 
2240 void
2241 sfxge_rx_scale_update(void *arg)
2242 {
2243 	sfxge_t *sp = arg;
2244 	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2245 	sfxge_intr_t *sip;
2246 	processorid_t id;
2247 	unsigned int count;
2248 	unsigned int *tbl;
2249 	unsigned int *rating;
2250 	unsigned int entry;
2251 	int rc;
2252 
2253 	mutex_enter(&(srsp->srs_lock));
2254 
2255 	if (srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
2256 		rc = EFAULT;
2257 		goto fail1;
2258 	}
2259 
2260 	if ((tbl =  kmem_zalloc(sizeof (unsigned int) * SFXGE_RX_SCALE_MAX,
2261 	    KM_NOSLEEP)) == NULL) {
2262 		rc = ENOMEM;
2263 		goto fail2;
2264 	}
2265 
2266 	sip = &(sp->s_intr);
2267 	if ((rating = kmem_zalloc(sizeof (unsigned int) * sip->si_nalloc,
2268 	    KM_NOSLEEP)) == NULL) {
2269 		rc = ENOMEM;
2270 		goto fail3;
2271 	}
2272 
2273 	mutex_enter(&cpu_lock);
2274 
2275 	/*
2276 	 * Substract any current CPU, core, cache and chip usage from the
2277 	 * global contention tables.
2278 	 */
2279 	for (id = 0; id < NCPU; id++) {
2280 		ASSERT3U(sfxge_cpu[id], >=, srsp->srs_cpu[id]);
2281 		sfxge_cpu[id] -= srsp->srs_cpu[id];
2282 		srsp->srs_cpu[id] = 0;
2283 	}
2284 
2285 	ASSERT(srsp->srs_count != 0);
2286 
2287 	/* Choose as many event queues as we need */
2288 	for (count = 0; count < srsp->srs_count; count++) {
2289 		unsigned int index;
2290 		sfxge_evq_t *sep;
2291 		unsigned int choice;
2292 		unsigned int choice_rating;
2293 
2294 		bzero(rating, sizeof (unsigned int) * sip->si_nalloc);
2295 
2296 		/*
2297 		 * Rate each event queue on its global level of CPU
2298 		 * contention.
2299 		 */
2300 		for (index = 0; index < sip->si_nalloc; index++) {
2301 			sep = sp->s_sep[index];
2302 
2303 			id = sep->se_cpu_id;
2304 			rating[index] += sfxge_cpu[id];
2305 		}
2306 
2307 		/* Choose the queue with the lowest CPU contention */
2308 		choice = 0;
2309 		choice_rating = rating[0];
2310 
2311 		for (index = 1; index < sip->si_nalloc; index++) {
2312 			if (rating[index] < choice_rating) {
2313 				choice = index;
2314 				choice_rating = rating[index];
2315 			}
2316 		}
2317 
2318 		/* Add our choice to the condensed RSS table */
2319 		tbl[count] = choice;
2320 
2321 		/* Add information to the global contention tables */
2322 		sep = sp->s_sep[choice];
2323 
2324 		id = sep->se_cpu_id;
2325 		srsp->srs_cpu[id]++;
2326 		sfxge_cpu[id]++;
2327 	}
2328 
2329 	mutex_exit(&cpu_lock);
2330 
2331 	/* Build the expanded RSS table */
2332 	count = 0;
2333 	for (entry = 0; entry < SFXGE_RX_SCALE_MAX; entry++) {
2334 		unsigned int index;
2335 
2336 		index = tbl[count];
2337 		count = (count + 1) % srsp->srs_count;
2338 
2339 		srsp->srs_tbl[entry] = index;
2340 	}
2341 
2342 	/* Program the expanded RSS table into the hardware */
2343 	(void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
2344 	    SFXGE_RX_SCALE_MAX);
2345 
2346 	mutex_exit(&(srsp->srs_lock));
2347 	kmem_free(rating, sizeof (unsigned int) * sip->si_nalloc);
2348 	kmem_free(tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
2349 	return;
2350 
2351 fail3:
2352 	DTRACE_PROBE(fail3);
2353 	kmem_free(tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
2354 fail2:
2355 	DTRACE_PROBE(fail2);
2356 fail1:
2357 	DTRACE_PROBE1(fail1, int, rc);
2358 
2359 	mutex_exit(&(srsp->srs_lock));
2360 }
2361 
2362 static int
2363 sfxge_rx_scale_start(sfxge_t *sp)
2364 {
2365 	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2366 	int rc;
2367 
2368 	mutex_enter(&(srsp->srs_lock));
2369 
2370 	ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_INITIALIZED);
2371 
2372 	/* Clear down the RSS table */
2373 	bzero(srsp->srs_tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
2374 
2375 	(void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
2376 	    SFXGE_RX_SCALE_MAX);
2377 
2378 	if ((rc = sfxge_toeplitz_hash_init(sp)) != 0)
2379 		goto fail1;
2380 
2381 	srsp->srs_state = SFXGE_RX_SCALE_STARTED;
2382 
2383 	mutex_exit(&(srsp->srs_lock));
2384 
2385 	/* sfxge_t->s_state_lock held */
2386 	(void) ddi_taskq_dispatch(sp->s_tqp, sfxge_rx_scale_update, sp,
2387 	    DDI_SLEEP);
2388 
2389 	return (0);
2390 
2391 fail1:
2392 	DTRACE_PROBE1(fail1, int, rc);
2393 
2394 	mutex_exit(&(srsp->srs_lock));
2395 
2396 	return (rc);
2397 }
2398 
2399 int
2400 sfxge_rx_scale_count_get(sfxge_t *sp, unsigned int *countp)
2401 {
2402 	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2403 	int rc;
2404 
2405 	mutex_enter(&(srsp->srs_lock));
2406 
2407 	if (srsp->srs_state != SFXGE_RX_SCALE_INITIALIZED &&
2408 	    srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
2409 		rc = ENOTSUP;
2410 		goto fail1;
2411 	}
2412 
2413 	*countp = srsp->srs_count;
2414 
2415 	mutex_exit(&(srsp->srs_lock));
2416 
2417 	return (0);
2418 
2419 fail1:
2420 	DTRACE_PROBE1(fail1, int, rc);
2421 
2422 	mutex_exit(&(srsp->srs_lock));
2423 
2424 	return (rc);
2425 }
2426 
2427 int
2428 sfxge_rx_scale_count_set(sfxge_t *sp, unsigned int count)
2429 {
2430 	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2431 	sfxge_intr_t *sip = &(sp->s_intr);
2432 	int dispatch = 1;
2433 	int rc;
2434 
2435 	if (count < 1 || count > sip->si_nalloc) {
2436 		rc = EINVAL;
2437 		goto fail1;
2438 	}
2439 
2440 	mutex_enter(&(srsp->srs_lock));
2441 
2442 	if (srsp->srs_state != SFXGE_RX_SCALE_INITIALIZED &&
2443 	    srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
2444 		rc = ENOTSUP;
2445 		goto fail2;
2446 	}
2447 
2448 	srsp->srs_count = count;
2449 
2450 	if (srsp->srs_state != SFXGE_RX_SCALE_STARTED)
2451 		dispatch = 0;
2452 
2453 	mutex_exit(&(srsp->srs_lock));
2454 
2455 	if (dispatch)
2456 		/* no locks held */
2457 		(void) ddi_taskq_dispatch(sp->s_tqp, sfxge_rx_scale_update, sp,
2458 		    DDI_SLEEP);
2459 
2460 	return (0);
2461 
2462 fail2:
2463 	DTRACE_PROBE(fail2);
2464 
2465 	mutex_exit(&(srsp->srs_lock));
2466 
2467 fail1:
2468 	DTRACE_PROBE1(fail1, int, rc);
2469 
2470 	return (rc);
2471 }
2472 
2473 static void
2474 sfxge_rx_scale_stop(sfxge_t *sp)
2475 {
2476 	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2477 	processorid_t id;
2478 
2479 	mutex_enter(&(srsp->srs_lock));
2480 
2481 	ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_STARTED);
2482 
2483 	srsp->srs_state = SFXGE_RX_SCALE_INITIALIZED;
2484 
2485 	mutex_enter(&cpu_lock);
2486 
2487 	/*
2488 	 * Substract any current CPU, core, cache and chip usage from the
2489 	 * global contention tables.
2490 	 */
2491 	for (id = 0; id < NCPU; id++) {
2492 		ASSERT3U(sfxge_cpu[id], >=, srsp->srs_cpu[id]);
2493 		sfxge_cpu[id] -= srsp->srs_cpu[id];
2494 		srsp->srs_cpu[id] = 0;
2495 	}
2496 
2497 	mutex_exit(&cpu_lock);
2498 
2499 	/* Clear down the RSS table */
2500 	bzero(srsp->srs_tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
2501 
2502 	(void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
2503 	    SFXGE_RX_SCALE_MAX);
2504 
2505 	mutex_exit(&(srsp->srs_lock));
2506 }
2507 
2508 static void
2509 sfxge_rx_scale_fini(sfxge_t *sp)
2510 {
2511 	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2512 
2513 	ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_INITIALIZED);
2514 
2515 	srsp->srs_state = SFXGE_RX_SCALE_UNINITIALIZED;
2516 
2517 	/* Tear down the kstats */
2518 	sfxge_rx_scale_kstat_fini(sp);
2519 
2520 	srsp->srs_count = 0;
2521 
2522 	mutex_destroy(&(srsp->srs_lock));
2523 
2524 	/* Destroy tables */
2525 	kmem_free(srsp->srs_cpu, sizeof (unsigned int) * NCPU);
2526 	srsp->srs_cpu = NULL;
2527 
2528 	sfxge_toeplitz_hash_fini(sp);
2529 }
2530 
2531 int
2532 sfxge_rx_init(sfxge_t *sp)
2533 {
2534 	sfxge_intr_t *sip = &(sp->s_intr);
2535 	char name[MAXNAMELEN];
2536 	int index;
2537 	int rc;
2538 
2539 	if (sip->si_state == SFXGE_INTR_UNINITIALIZED) {
2540 		rc = EINVAL;
2541 		goto fail1;
2542 	}
2543 
2544 	if ((rc = sfxge_rx_scale_init(sp)) != 0)
2545 		goto fail2;
2546 
2547 	(void) snprintf(name, MAXNAMELEN - 1, "%s%d_rx_packet_cache",
2548 	    ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2549 
2550 	sp->s_rpc = kmem_cache_create(name, sizeof (sfxge_rx_packet_t),
2551 	    SFXGE_CPU_CACHE_SIZE, sfxge_rx_packet_ctor, sfxge_rx_packet_dtor,
2552 	    NULL, sp, NULL, 0);
2553 	ASSERT(sp->s_rpc != NULL);
2554 
2555 	(void) snprintf(name, MAXNAMELEN - 1, "%s%d_rxq_cache",
2556 	    ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2557 
2558 	sp->s_rqc = kmem_cache_create(name, sizeof (sfxge_rxq_t),
2559 	    SFXGE_CPU_CACHE_SIZE, sfxge_rx_qctor, sfxge_rx_qdtor, NULL, sp,
2560 	    NULL, 0);
2561 	ASSERT(sp->s_rqc != NULL);
2562 
2563 	sp->s_rx_pkt_mem_max = ddi_prop_get_int64(DDI_DEV_T_ANY, sp->s_dip,
2564 	    DDI_PROP_DONTPASS, "rx_pkt_mem_max", 0); /* disabled */
2565 
2566 	/* Initialize the receive queue(s) */
2567 	for (index = 0; index < sip->si_nalloc; index++) {
2568 		if ((rc = sfxge_rx_qinit(sp, index)) != 0)
2569 			goto fail3;
2570 	}
2571 
2572 	sp->s_rx_coalesce_mode = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
2573 	    DDI_PROP_DONTPASS, "rx_coalesce_mode", SFXGE_RX_COALESCE_OFF);
2574 
2575 	return (0);
2576 
2577 fail3:
2578 	DTRACE_PROBE(fail3);
2579 
2580 	/* Tear down the receive queue(s) */
2581 	while (--index >= 0)
2582 		sfxge_rx_qfini(sp, index);
2583 
2584 	kmem_cache_destroy(sp->s_rqc);
2585 	sp->s_rqc = NULL;
2586 
2587 	kmem_cache_destroy(sp->s_rpc);
2588 	sp->s_rpc = NULL;
2589 
2590 	sfxge_rx_scale_fini(sp);
2591 
2592 fail2:
2593 	DTRACE_PROBE(fail2);
2594 fail1:
2595 	DTRACE_PROBE1(fail1, int, rc);
2596 
2597 	return (rc);
2598 }
2599 
2600 int
2601 sfxge_rx_start(sfxge_t *sp)
2602 {
2603 	sfxge_mac_t *smp = &(sp->s_mac);
2604 	sfxge_intr_t *sip;
2605 	const efx_nic_cfg_t *encp;
2606 	size_t hdrlen, align;
2607 	int index;
2608 	int rc;
2609 
2610 	mutex_enter(&(smp->sm_lock));
2611 
2612 	/* Calculate the receive packet buffer size and alignment */
2613 	sp->s_rx_buffer_size = EFX_MAC_PDU(sp->s_mtu);
2614 
2615 	encp = efx_nic_cfg_get(sp->s_enp);
2616 
2617 	/* Packet buffer allocations are cache line aligned */
2618 	EFSYS_ASSERT3U(encp->enc_rx_buf_align_start, <=, SFXGE_CPU_CACHE_SIZE);
2619 
2620 	if (sp->s_family == EFX_FAMILY_HUNTINGTON) {
2621 		sp->s_rx_prefix_size = encp->enc_rx_prefix_size;
2622 
2623 		hdrlen = sp->s_rx_prefix_size + sizeof (struct ether_header);
2624 
2625 		/* Ensure IP headers are 32bit aligned */
2626 		sp->s_rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
2627 		sp->s_rx_buffer_size += sp->s_rx_buffer_align;
2628 
2629 	} else if (encp->enc_features & EFX_FEATURE_LFSR_HASH_INSERT) {
2630 		sp->s_rx_prefix_size = encp->enc_rx_prefix_size;
2631 
2632 		/*
2633 		 * Place the start of the buffer a prefix length minus 2
2634 		 * before the start of a cache line. This ensures that the
2635 		 * last two bytes of the prefix (which is where the LFSR hash
2636 		 * is located) are in the same cache line as the headers, and
2637 		 * the IP header is 32-bit aligned.
2638 		 */
2639 		sp->s_rx_buffer_align =
2640 		    SFXGE_CPU_CACHE_SIZE - (encp->enc_rx_prefix_size - 2);
2641 		sp->s_rx_buffer_size += sp->s_rx_buffer_align;
2642 	} else {
2643 		sp->s_rx_prefix_size = 0;
2644 
2645 		/*
2646 		 * Place the start of the buffer 2 bytes after a cache line
2647 		 * boundary so that the headers fit into the cache line and
2648 		 * the IP header is 32-bit aligned.
2649 		 */
2650 		hdrlen = sp->s_rx_prefix_size + sizeof (struct ether_header);
2651 
2652 		sp->s_rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
2653 		sp->s_rx_buffer_size += sp->s_rx_buffer_align;
2654 	}
2655 
2656 	/* Align end of packet buffer for RX DMA end padding */
2657 	align = MAX(1, encp->enc_rx_buf_align_end);
2658 	EFSYS_ASSERT(ISP2(align));
2659 	sp->s_rx_buffer_size = P2ROUNDUP(sp->s_rx_buffer_size, align);
2660 
2661 	/* Initialize the receive module */
2662 	if ((rc = efx_rx_init(sp->s_enp)) != 0)
2663 		goto fail1;
2664 
2665 	mutex_exit(&(smp->sm_lock));
2666 
2667 	if ((rc = sfxge_rx_scale_start(sp)) != 0)
2668 		goto fail2;
2669 
2670 	/* Start the receive queue(s) */
2671 	sip = &(sp->s_intr);
2672 	for (index = 0; index < sip->si_nalloc; index++) {
2673 		if ((rc = sfxge_rx_qstart(sp, index)) != 0)
2674 			goto fail3;
2675 	}
2676 
2677 	ASSERT3U(sp->s_srp[0]->sr_state, ==, SFXGE_RXQ_STARTED);
2678 	/* It is sufficient to have Rx scale initialized */
2679 	ASSERT3U(sp->s_rx_scale.srs_state, ==, SFXGE_RX_SCALE_STARTED);
2680 	rc = efx_mac_filter_default_rxq_set(sp->s_enp, sp->s_srp[0]->sr_erp,
2681 	    sp->s_rx_scale.srs_count > 1);
2682 	if (rc != 0)
2683 		goto fail4;
2684 
2685 	return (0);
2686 
2687 fail4:
2688 	DTRACE_PROBE(fail4);
2689 
2690 fail3:
2691 	DTRACE_PROBE(fail3);
2692 
2693 	/* Stop the receive queue(s) */
2694 	while (--index >= 0)
2695 		sfxge_rx_qstop(sp, index);
2696 
2697 	sfxge_rx_scale_stop(sp);
2698 
2699 fail2:
2700 	DTRACE_PROBE(fail2);
2701 
2702 	mutex_enter(&(smp->sm_lock));
2703 
2704 	/* Tear down the receive module */
2705 	efx_rx_fini(sp->s_enp);
2706 
2707 fail1:
2708 	DTRACE_PROBE1(fail1, int, rc);
2709 
2710 	mutex_exit(&(smp->sm_lock));
2711 
2712 	return (rc);
2713 }
2714 
2715 void
2716 sfxge_rx_coalesce_mode_get(sfxge_t *sp, sfxge_rx_coalesce_mode_t *modep)
2717 {
2718 	*modep = sp->s_rx_coalesce_mode;
2719 }
2720 
2721 int
2722 sfxge_rx_coalesce_mode_set(sfxge_t *sp, sfxge_rx_coalesce_mode_t mode)
2723 {
2724 	int rc;
2725 
2726 	switch (mode) {
2727 	case SFXGE_RX_COALESCE_OFF:
2728 	case SFXGE_RX_COALESCE_DISALLOW_PUSH:
2729 	case SFXGE_RX_COALESCE_ALLOW_PUSH:
2730 		break;
2731 
2732 	default:
2733 		rc = EINVAL;
2734 		goto fail1;
2735 	}
2736 
2737 	sp->s_rx_coalesce_mode = mode;
2738 
2739 	return (0);
2740 
2741 fail1:
2742 	DTRACE_PROBE1(fail1, int, rc);
2743 
2744 	return (rc);
2745 }
2746 
2747 void
2748 sfxge_rx_stop(sfxge_t *sp)
2749 {
2750 	sfxge_mac_t *smp = &(sp->s_mac);
2751 	sfxge_intr_t *sip = &(sp->s_intr);
2752 	efx_nic_t *enp = sp->s_enp;
2753 	int index;
2754 
2755 	ASSERT(mutex_owned(&(sp->s_state_lock)));
2756 
2757 	efx_mac_filter_default_rxq_clear(enp);
2758 
2759 	/* Stop the receive queue(s) */
2760 	index = sip->si_nalloc;
2761 	while (--index >= 0) {
2762 		/* TBD: Flush RXQs in parallel; HW has limit + may need retry */
2763 		sfxge_rx_qstop(sp, index);
2764 	}
2765 
2766 	sfxge_rx_scale_stop(sp);
2767 
2768 	mutex_enter(&(smp->sm_lock));
2769 
2770 	/* Tear down the receive module */
2771 	efx_rx_fini(enp);
2772 
2773 	sp->s_rx_buffer_align = 0;
2774 	sp->s_rx_prefix_size = 0;
2775 	sp->s_rx_buffer_size = 0;
2776 
2777 	mutex_exit(&(smp->sm_lock));
2778 }
2779 
2780 unsigned int
2781 sfxge_rx_loaned(sfxge_t *sp)
2782 {
2783 	sfxge_intr_t *sip = &(sp->s_intr);
2784 	int index;
2785 	unsigned int loaned;
2786 
2787 	ASSERT3U(sip->si_state, ==, SFXGE_INTR_INITIALIZED);
2788 
2789 	loaned = 0;
2790 	for (index = 0; index < sip->si_nalloc; index++) {
2791 		sfxge_rxq_t *srp = sp->s_srp[index];
2792 		sfxge_evq_t *sep = sp->s_sep[srp->sr_index];
2793 
2794 		mutex_enter(&(sep->se_lock));
2795 
2796 		loaned += sfxge_rx_qfpp_swizzle(srp);
2797 
2798 		mutex_exit(&(sep->se_lock));
2799 	}
2800 
2801 	return (loaned);
2802 }
2803 
2804 void
2805 sfxge_rx_fini(sfxge_t *sp)
2806 {
2807 	sfxge_intr_t *sip = &(sp->s_intr);
2808 	int index;
2809 
2810 	ASSERT3U(sip->si_state, ==, SFXGE_INTR_INITIALIZED);
2811 
2812 	sp->s_rx_coalesce_mode = SFXGE_RX_COALESCE_OFF;
2813 
2814 	/* Tear down the receive queue(s) */
2815 	index = sip->si_nalloc;
2816 	while (--index >= 0)
2817 		sfxge_rx_qfini(sp, index);
2818 
2819 	ASSERT3U(sp->s_rx_pkt_mem_alloc, ==, 0);
2820 
2821 	kmem_cache_destroy(sp->s_rqc);
2822 	sp->s_rqc = NULL;
2823 
2824 	kmem_cache_destroy(sp->s_rpc);
2825 	sp->s_rpc = NULL;
2826 
2827 	sfxge_rx_scale_fini(sp);
2828 }
2829