xref: /illumos-gate/usr/src/uts/common/io/sfxge/sfxge_tx.c (revision 13b136d3061155363c62c9f6568d25b8b27da8f6)
1 /*
2  * Copyright (c) 2008-2016 Solarflare Communications Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  *    this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright notice,
11  *    this list of conditions and the following disclaimer in the documentation
12  *    and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * The views and conclusions contained in the software and documentation are
27  * those of the authors and should not be interpreted as representing official
28  * policies, either expressed or implied, of the FreeBSD Project.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/sysmacros.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/atomic.h>
36 #include <sys/stream.h>
37 #include <sys/strsun.h>
38 #include <sys/strsubr.h>
39 #include <sys/pattr.h>
40 #include <sys/cpu.h>
41 
42 #include <sys/ethernet.h>
43 #include <inet/ip.h>
44 
45 #include <netinet/in.h>
46 #include <netinet/ip.h>
47 #include <netinet/tcp.h>
48 
49 #include "sfxge.h"
50 
51 #include "efx.h"
52 
53 /* TXQ flush response timeout (in microseconds) */
54 #define	SFXGE_TX_QFLUSH_USEC	(2000000)
55 
56 /* See sfxge.conf.private for descriptions */
57 #define	SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT 4096
58 #define	SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT 256
59 
60 
61 /* Transmit buffer DMA attributes */
62 static ddi_device_acc_attr_t sfxge_tx_buffer_devacc = {
63 
64 	DDI_DEVICE_ATTR_V0,	/* devacc_attr_version */
65 	DDI_NEVERSWAP_ACC,	/* devacc_attr_endian_flags */
66 	DDI_STRICTORDER_ACC	/* devacc_attr_dataorder */
67 };
68 
69 static ddi_dma_attr_t sfxge_tx_buffer_dma_attr = {
70 	DMA_ATTR_V0,		/* dma_attr_version	*/
71 	0,			/* dma_attr_addr_lo	*/
72 	0xffffffffffffffffull,	/* dma_attr_addr_hi	*/
73 	0xffffffffffffffffull,	/* dma_attr_count_max	*/
74 	SFXGE_TX_BUFFER_SIZE,	/* dma_attr_align	*/
75 	0xffffffff,		/* dma_attr_burstsizes	*/
76 	1,			/* dma_attr_minxfer	*/
77 	0xffffffffffffffffull,	/* dma_attr_maxxfer	*/
78 	0xffffffffffffffffull,	/* dma_attr_seg		*/
79 	1,			/* dma_attr_sgllen	*/
80 	1,			/* dma_attr_granular	*/
81 	0			/* dma_attr_flags	*/
82 };
83 
84 /* Transmit mapping DMA attributes */
85 static ddi_dma_attr_t sfxge_tx_mapping_dma_attr = {
86 	DMA_ATTR_V0,		/* dma_attr_version	*/
87 	0,			/* dma_attr_addr_lo	*/
88 	0xffffffffffffffffull,	/* dma_attr_addr_hi	*/
89 	0xffffffffffffffffull,	/* dma_attr_count_max	*/
90 	1,			/* dma_attr_align	*/
91 	0xffffffff,		/* dma_attr_burstsizes	*/
92 	1,			/* dma_attr_minxfer	*/
93 	0xffffffffffffffffull,	/* dma_attr_maxxfer	*/
94 	0xffffffffffffffffull,	/* dma_attr_seg		*/
95 	0x7fffffff,		/* dma_attr_sgllen	*/
96 	1,			/* dma_attr_granular	*/
97 	0			/* dma_attr_flags	*/
98 };
99 
100 /* Transmit queue DMA attributes */
101 static ddi_device_acc_attr_t sfxge_txq_devacc = {
102 
103 	DDI_DEVICE_ATTR_V0,	/* devacc_attr_version */
104 	DDI_NEVERSWAP_ACC,	/* devacc_attr_endian_flags */
105 	DDI_STRICTORDER_ACC	/* devacc_attr_dataorder */
106 };
107 
108 static ddi_dma_attr_t sfxge_txq_dma_attr = {
109 	DMA_ATTR_V0,		/* dma_attr_version	*/
110 	0,			/* dma_attr_addr_lo	*/
111 	0xffffffffffffffffull,	/* dma_attr_addr_hi	*/
112 	0xffffffffffffffffull,	/* dma_attr_count_max	*/
113 	EFX_BUF_SIZE,		/* dma_attr_align	*/
114 	0xffffffff,		/* dma_attr_burstsizes	*/
115 	1,			/* dma_attr_minxfer	*/
116 	0xffffffffffffffffull,	/* dma_attr_maxxfer	*/
117 	0xffffffffffffffffull,	/* dma_attr_seg		*/
118 	1,			/* dma_attr_sgllen	*/
119 	1,			/* dma_attr_granular	*/
120 	0			/* dma_attr_flags	*/
121 };
122 
123 
124 /*
125  * A sfxge_tx_qdpl_swizzle() can happen when the DPL get list is one packet
126  * under the limit, and must move all packets from the DPL put->get list
127  * Hence this is the real maximum length of the TX DPL get list.
128  */
129 static int
130 sfxge_tx_dpl_get_pkt_max(sfxge_txq_t *stp)
131 {
132 	sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
133 	return (stdp->get_pkt_limit + stdp->put_pkt_limit - 1);
134 }
135 
136 
137 static int
138 sfxge_tx_packet_ctor(void *buf, void *arg, int kmflags)
139 {
140 	_NOTE(ARGUNUSED(arg, kmflags))
141 
142 	bzero(buf, sizeof (sfxge_tx_packet_t));
143 
144 	return (0);
145 }
146 
147 static void
148 sfxge_tx_packet_dtor(void *buf, void *arg)
149 {
150 	sfxge_tx_packet_t *stpp = buf;
151 
152 	_NOTE(ARGUNUSED(arg))
153 
154 	SFXGE_OBJ_CHECK(stpp, sfxge_tx_packet_t);
155 }
156 
157 static int
158 sfxge_tx_buffer_ctor(void *buf, void *arg, int kmflags)
159 {
160 	sfxge_tx_buffer_t *stbp = buf;
161 	sfxge_t *sp = arg;
162 	sfxge_dma_buffer_attr_t dma_attr;
163 	int rc;
164 
165 	bzero(buf, sizeof (sfxge_tx_buffer_t));
166 
167 	dma_attr.sdba_dip	 = sp->s_dip;
168 	dma_attr.sdba_dattrp	 = &sfxge_tx_buffer_dma_attr;
169 	dma_attr.sdba_callback	 = ((kmflags == KM_SLEEP) ?
170 	    DDI_DMA_SLEEP : DDI_DMA_DONTWAIT);
171 	dma_attr.sdba_length	 = SFXGE_TX_BUFFER_SIZE;
172 	dma_attr.sdba_memflags	 = DDI_DMA_STREAMING;
173 	dma_attr.sdba_devaccp	 = &sfxge_tx_buffer_devacc;
174 	dma_attr.sdba_bindflags	 = DDI_DMA_WRITE | DDI_DMA_STREAMING;
175 	dma_attr.sdba_maxcookies = 1;
176 	dma_attr.sdba_zeroinit	 = B_FALSE;
177 
178 	if ((rc = sfxge_dma_buffer_create(&(stbp->stb_esm), &dma_attr)) != 0)
179 		goto fail1;
180 
181 	return (0);
182 
183 fail1:
184 	DTRACE_PROBE1(fail1, int, rc);
185 
186 	SFXGE_OBJ_CHECK(stbp, sfxge_tx_buffer_t);
187 
188 	return (-1);
189 }
190 
191 static void
192 sfxge_tx_buffer_dtor(void *buf, void *arg)
193 {
194 	sfxge_tx_buffer_t *stbp = buf;
195 
196 	_NOTE(ARGUNUSED(arg))
197 
198 	sfxge_dma_buffer_destroy(&(stbp->stb_esm));
199 
200 	SFXGE_OBJ_CHECK(stbp, sfxge_tx_buffer_t);
201 }
202 
203 static int
204 sfxge_tx_mapping_ctor(void *buf, void *arg, int kmflags)
205 {
206 	sfxge_tx_mapping_t *stmp = buf;
207 	sfxge_t *sp = arg;
208 	dev_info_t *dip = sp->s_dip;
209 	int rc;
210 
211 	bzero(buf, sizeof (sfxge_tx_mapping_t));
212 
213 	stmp->stm_sp = sp;
214 
215 	/* Allocate DMA handle */
216 	rc = ddi_dma_alloc_handle(dip, &sfxge_tx_mapping_dma_attr,
217 	    (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT,
218 	    NULL, &(stmp->stm_dma_handle));
219 	if (rc != DDI_SUCCESS)
220 		goto fail1;
221 
222 	return (0);
223 
224 fail1:
225 	DTRACE_PROBE1(fail1, int, rc);
226 
227 	stmp->stm_sp = NULL;
228 
229 	SFXGE_OBJ_CHECK(stmp, sfxge_tx_mapping_t);
230 
231 	return (-1);
232 }
233 
234 static void
235 sfxge_tx_mapping_dtor(void *buf, void *arg)
236 {
237 	sfxge_tx_mapping_t *stmp = buf;
238 
239 	ASSERT3P(stmp->stm_sp, ==, arg);
240 
241 	/* Free the DMA handle */
242 	ddi_dma_free_handle(&(stmp->stm_dma_handle));
243 	stmp->stm_dma_handle = NULL;
244 
245 	stmp->stm_sp = NULL;
246 
247 	SFXGE_OBJ_CHECK(stmp, sfxge_tx_mapping_t);
248 }
249 
250 static int
251 sfxge_tx_qctor(void *buf, void *arg, int kmflags)
252 {
253 	sfxge_txq_t *stp = buf;
254 	efsys_mem_t *esmp = &(stp->st_mem);
255 	sfxge_t *sp = arg;
256 	sfxge_dma_buffer_attr_t dma_attr;
257 	sfxge_tx_dpl_t *stdp;
258 	int rc;
259 
260 	/* Compile-time structure layout checks */
261 	EFX_STATIC_ASSERT(sizeof (stp->__st_u1.__st_s1) <=
262 	    sizeof (stp->__st_u1.__st_pad));
263 	EFX_STATIC_ASSERT(sizeof (stp->__st_u2.__st_s2) <=
264 	    sizeof (stp->__st_u2.__st_pad));
265 	EFX_STATIC_ASSERT(sizeof (stp->__st_u3.__st_s3) <=
266 	    sizeof (stp->__st_u3.__st_pad));
267 	EFX_STATIC_ASSERT(sizeof (stp->__st_u4.__st_s4) <=
268 	    sizeof (stp->__st_u4.__st_pad));
269 
270 	bzero(buf, sizeof (sfxge_txq_t));
271 
272 	stp->st_sp = sp;
273 
274 	dma_attr.sdba_dip	 = sp->s_dip;
275 	dma_attr.sdba_dattrp	 = &sfxge_txq_dma_attr;
276 	dma_attr.sdba_callback	 = DDI_DMA_SLEEP;
277 	dma_attr.sdba_length	 = EFX_TXQ_SIZE(SFXGE_TX_NDESCS);
278 	dma_attr.sdba_memflags	 = DDI_DMA_CONSISTENT;
279 	dma_attr.sdba_devaccp	 = &sfxge_txq_devacc;
280 	dma_attr.sdba_bindflags	 = DDI_DMA_READ | DDI_DMA_CONSISTENT;
281 	dma_attr.sdba_maxcookies = EFX_TXQ_NBUFS(SFXGE_TX_NDESCS);
282 	dma_attr.sdba_zeroinit	 = B_FALSE;
283 
284 	if ((rc = sfxge_dma_buffer_create(esmp, &dma_attr)) != 0)
285 		goto fail1;
286 
287 	/* Allocate some buffer table entries */
288 	if ((rc = sfxge_sram_buf_tbl_alloc(sp, EFX_TXQ_NBUFS(SFXGE_TX_NDESCS),
289 	    &(stp->st_id))) != 0)
290 		goto fail2;
291 
292 	/* Allocate the descriptor array */
293 	if ((stp->st_eb = kmem_zalloc(sizeof (efx_buffer_t) *
294 	    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS), kmflags)) == NULL) {
295 		rc = ENOMEM;
296 		goto fail3;
297 	}
298 
299 	/* Allocate the context arrays */
300 	if ((stp->st_stmp = kmem_zalloc(sizeof (sfxge_tx_mapping_t *) *
301 	    SFXGE_TX_NDESCS, kmflags)) == NULL) {
302 		rc = ENOMEM;
303 		goto fail4;
304 	}
305 
306 	if ((stp->st_stbp = kmem_zalloc(sizeof (sfxge_tx_buffer_t *) *
307 	    SFXGE_TX_NDESCS, kmflags)) == NULL) {
308 		rc = ENOMEM;
309 		goto fail5;
310 	}
311 
312 	if ((stp->st_mp = kmem_zalloc(sizeof (mblk_t *) *
313 	    SFXGE_TX_NDESCS, kmflags)) == NULL) {
314 		rc = ENOMEM;
315 		goto fail6;
316 	}
317 
318 	/* Initialize the deferred packet list */
319 	stdp = &(stp->st_dpl);
320 	stdp->std_getp = &(stdp->std_get);
321 
322 	stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;
323 
324 	return (0);
325 
326 fail6:
327 	DTRACE_PROBE(fail6);
328 
329 	kmem_free(stp->st_stbp, sizeof (sfxge_tx_buffer_t *) * SFXGE_TX_NDESCS);
330 	stp->st_stbp = NULL;
331 
332 fail5:
333 	DTRACE_PROBE(fail5);
334 
335 	kmem_free(stp->st_stmp,
336 	    sizeof (sfxge_tx_mapping_t *) * SFXGE_TX_NDESCS);
337 	stp->st_stmp = NULL;
338 
339 fail4:
340 	DTRACE_PROBE(fail4);
341 
342 	/* Free the descriptor array */
343 	kmem_free(stp->st_eb, sizeof (efx_buffer_t) *
344 	    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
345 	stp->st_eb = NULL;
346 
347 fail3:
348 	DTRACE_PROBE(fail3);
349 
350 	/* Free the buffer table entries */
351 	sfxge_sram_buf_tbl_free(sp, stp->st_id, EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));
352 	stp->st_id = 0;
353 
354 fail2:
355 	DTRACE_PROBE(fail2);
356 
357 	/* Tear down DMA setup */
358 	sfxge_dma_buffer_destroy(esmp);
359 
360 fail1:
361 	DTRACE_PROBE1(fail1, int, rc);
362 
363 	stp->st_sp = NULL;
364 
365 	SFXGE_OBJ_CHECK(stp, sfxge_txq_t);
366 
367 	return (-1);
368 }
369 
370 static void
371 sfxge_tx_qdtor(void *buf, void *arg)
372 {
373 	sfxge_txq_t *stp = buf;
374 	efsys_mem_t *esmp = &(stp->st_mem);
375 	sfxge_t *sp = stp->st_sp;
376 	sfxge_tx_dpl_t *stdp;
377 
378 	_NOTE(ARGUNUSED(arg))
379 
380 	stp->st_unblock = 0;
381 
382 	/* Tear down the deferred packet list */
383 	stdp = &(stp->st_dpl);
384 	ASSERT3P(stdp->std_getp, ==, &(stdp->std_get));
385 	stdp->std_getp = NULL;
386 
387 	/* Free the context arrays */
388 	kmem_free(stp->st_mp, sizeof (mblk_t *) * SFXGE_TX_NDESCS);
389 	stp->st_mp = NULL;
390 
391 	kmem_free(stp->st_stbp, sizeof (sfxge_tx_buffer_t *) * SFXGE_TX_NDESCS);
392 	stp->st_stbp = NULL;
393 
394 	kmem_free(stp->st_stmp,
395 	    sizeof (sfxge_tx_mapping_t *) * SFXGE_TX_NDESCS);
396 	stp->st_stmp = NULL;
397 
398 	/* Free the descriptor array */
399 	kmem_free(stp->st_eb, sizeof (efx_buffer_t) *
400 	    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
401 	stp->st_eb = NULL;
402 
403 	/* Free the buffer table entries */
404 	sfxge_sram_buf_tbl_free(sp, stp->st_id, EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));
405 	stp->st_id = 0;
406 
407 	/* Tear down dma setup */
408 	sfxge_dma_buffer_destroy(esmp);
409 
410 	stp->st_sp = NULL;
411 
412 	SFXGE_OBJ_CHECK(stp, sfxge_txq_t);
413 }
414 
415 static void
416 sfxge_tx_packet_destroy(sfxge_t *sp, sfxge_tx_packet_t *stpp)
417 {
418 	kmem_cache_free(sp->s_tpc, stpp);
419 }
420 
421 static sfxge_tx_packet_t *
422 sfxge_tx_packet_create(sfxge_t *sp)
423 {
424 	sfxge_tx_packet_t *stpp;
425 
426 	stpp = kmem_cache_alloc(sp->s_tpc, KM_NOSLEEP);
427 
428 	return (stpp);
429 }
430 
431 static inline int
432 sfxge_tx_qfpp_put(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp)
433 {
434 	sfxge_tx_fpp_t *stfp = &(stp->st_fpp);
435 
436 	ASSERT(mutex_owned(&(stp->st_lock)));
437 
438 	ASSERT3P(stpp->stp_next, ==, NULL);
439 	ASSERT3P(stpp->stp_mp, ==, NULL);
440 	ASSERT3P(stpp->stp_etherhp, ==, NULL);
441 	ASSERT3P(stpp->stp_iphp, ==, NULL);
442 	ASSERT3P(stpp->stp_thp, ==, NULL);
443 	ASSERT3U(stpp->stp_off, ==, 0);
444 	ASSERT3U(stpp->stp_size, ==, 0);
445 	ASSERT3U(stpp->stp_mss, ==, 0);
446 	ASSERT3U(stpp->stp_dpl_put_len, ==, 0);
447 
448 	if (stfp->stf_count < SFXGE_TX_FPP_MAX) {
449 		/* Add to the start of the list */
450 		stpp->stp_next = stfp->stf_stpp;
451 		stfp->stf_stpp = stpp;
452 		stfp->stf_count++;
453 
454 		return (0);
455 	}
456 
457 	DTRACE_PROBE(fpp_full);
458 	return (ENOSPC);
459 }
460 
461 static inline sfxge_tx_packet_t *
462 sfxge_tx_qfpp_get(sfxge_txq_t *stp)
463 {
464 	sfxge_tx_packet_t *stpp;
465 	sfxge_tx_fpp_t *stfp = &(stp->st_fpp);
466 
467 	ASSERT(mutex_owned(&(stp->st_lock)));
468 
469 	stpp = stfp->stf_stpp;
470 	if (stpp == NULL) {
471 		ASSERT3U(stfp->stf_count, ==, 0);
472 		return (NULL);
473 	}
474 
475 	/* Remove item from the head of the list */
476 	stfp->stf_stpp = stpp->stp_next;
477 	stpp->stp_next = NULL;
478 
479 	ASSERT3U(stfp->stf_count, >, 0);
480 	stfp->stf_count--;
481 
482 	if (stfp->stf_count != 0) {
483 		ASSERT(stfp->stf_stpp != NULL);
484 		prefetch_read_many(stfp->stf_stpp);
485 	}
486 	return (stpp);
487 }
488 
489 static void
490 sfxge_tx_qfpp_empty(sfxge_txq_t *stp)
491 {
492 	sfxge_t *sp = stp->st_sp;
493 	sfxge_tx_fpp_t *stfp = &(stp->st_fpp);
494 	sfxge_tx_packet_t *stpp;
495 
496 	mutex_enter(&(stp->st_lock));
497 
498 	stpp = stfp->stf_stpp;
499 	stfp->stf_stpp = NULL;
500 
501 	while (stpp != NULL) {
502 		sfxge_tx_packet_t *next;
503 
504 		next = stpp->stp_next;
505 		stpp->stp_next = NULL;
506 
507 		ASSERT3U(stfp->stf_count, >, 0);
508 		stfp->stf_count--;
509 
510 		sfxge_tx_packet_destroy(sp, stpp);
511 
512 		stpp = next;
513 	}
514 	ASSERT3U(stfp->stf_count, ==, 0);
515 
516 	mutex_exit(&(stp->st_lock));
517 }
518 
519 static inline void
520 sfxge_tx_qfbp_put(sfxge_txq_t *stp, sfxge_tx_buffer_t *stbp)
521 {
522 	sfxge_tx_fbp_t *stfp = &(stp->st_fbp);
523 
524 	ASSERT3P(stbp->stb_next, ==, NULL);
525 	ASSERT3U(stbp->stb_off, ==, 0);
526 	ASSERT3U(stbp->stb_esm.esm_used, ==, 0);
527 
528 	stbp->stb_next = stfp->stf_stbp;
529 	stfp->stf_stbp = stbp;
530 	stfp->stf_count++;
531 }
532 
533 
534 static inline sfxge_tx_buffer_t *
535 sfxge_tx_qfbp_get(sfxge_txq_t *stp)
536 {
537 	sfxge_tx_buffer_t *stbp;
538 	sfxge_tx_fbp_t *stfp = &(stp->st_fbp);
539 
540 	stbp = stfp->stf_stbp;
541 	if (stbp == NULL) {
542 		ASSERT3U(stfp->stf_count, ==, 0);
543 		return (NULL);
544 	}
545 
546 	stfp->stf_stbp = stbp->stb_next;
547 	stbp->stb_next = NULL;
548 
549 	ASSERT3U(stfp->stf_count, >, 0);
550 	stfp->stf_count--;
551 
552 	if (stfp->stf_count != 0) {
553 		ASSERT(stfp->stf_stbp != NULL);
554 		prefetch_read_many(stfp->stf_stbp);
555 	}
556 
557 	return (stbp);
558 }
559 
560 static void
561 sfxge_tx_qfbp_empty(sfxge_txq_t *stp)
562 {
563 	sfxge_t *sp = stp->st_sp;
564 	sfxge_tx_fbp_t *stfp = &(stp->st_fbp);
565 	sfxge_tx_buffer_t *stbp;
566 
567 	mutex_enter(&(stp->st_lock));
568 
569 	stbp = stfp->stf_stbp;
570 	stfp->stf_stbp = NULL;
571 
572 	while (stbp != NULL) {
573 		sfxge_tx_buffer_t *next;
574 
575 		next = stbp->stb_next;
576 		stbp->stb_next = NULL;
577 
578 		ASSERT3U(stfp->stf_count, >, 0);
579 		stfp->stf_count--;
580 
581 		kmem_cache_free(sp->s_tbc, stbp);
582 
583 		stbp = next;
584 	}
585 	ASSERT3U(stfp->stf_count, ==, 0);
586 
587 	mutex_exit(&(stp->st_lock));
588 }
589 
590 static inline void
591 sfxge_tx_qfmp_put(sfxge_txq_t *stp, sfxge_tx_mapping_t *stmp)
592 {
593 	sfxge_tx_fmp_t *stfp = &(stp->st_fmp);
594 
595 	ASSERT3P(stmp->stm_next, ==, NULL);
596 	ASSERT3P(stmp->stm_mp, ==, NULL);
597 	ASSERT3P(stmp->stm_base, ==, NULL);
598 	ASSERT3U(stmp->stm_off, ==, 0);
599 	ASSERT3U(stmp->stm_size, ==, 0);
600 
601 	stmp->stm_next = stfp->stf_stmp;
602 	stfp->stf_stmp = stmp;
603 	stfp->stf_count++;
604 }
605 
606 static inline sfxge_tx_mapping_t *
607 sfxge_tx_qfmp_get(sfxge_txq_t *stp)
608 {
609 	sfxge_tx_mapping_t *stmp;
610 	sfxge_tx_fmp_t *stfp = &(stp->st_fmp);
611 
612 	stmp = stfp->stf_stmp;
613 	if (stmp == NULL) {
614 		ASSERT3U(stfp->stf_count, ==, 0);
615 		return (NULL);
616 	}
617 
618 	stfp->stf_stmp = stmp->stm_next;
619 	stmp->stm_next = NULL;
620 
621 	ASSERT3U(stfp->stf_count, >, 0);
622 	stfp->stf_count--;
623 
624 	if (stfp->stf_count != 0) {
625 		ASSERT(stfp->stf_stmp != NULL);
626 		prefetch_read_many(stfp->stf_stmp);
627 	}
628 	return (stmp);
629 }
630 
631 static void
632 sfxge_tx_qfmp_empty(sfxge_txq_t *stp)
633 {
634 	sfxge_t *sp = stp->st_sp;
635 	sfxge_tx_fmp_t *stfp = &(stp->st_fmp);
636 	sfxge_tx_mapping_t *stmp;
637 
638 	mutex_enter(&(stp->st_lock));
639 
640 	stmp = stfp->stf_stmp;
641 	stfp->stf_stmp = NULL;
642 
643 	while (stmp != NULL) {
644 		sfxge_tx_mapping_t *next;
645 
646 		next = stmp->stm_next;
647 		stmp->stm_next = NULL;
648 
649 		ASSERT3U(stfp->stf_count, >, 0);
650 		stfp->stf_count--;
651 
652 		kmem_cache_free(sp->s_tmc, stmp);
653 
654 		stmp = next;
655 	}
656 	ASSERT3U(stfp->stf_count, ==, 0);
657 
658 	mutex_exit(&(stp->st_lock));
659 }
660 
661 static void
662 sfxge_tx_msgb_unbind(sfxge_tx_mapping_t *stmp)
663 {
664 	bzero(stmp->stm_addr, sizeof (uint64_t) * SFXGE_TX_MAPPING_NADDR);
665 	stmp->stm_off = 0;
666 
667 	(void) ddi_dma_unbind_handle(stmp->stm_dma_handle);
668 
669 	stmp->stm_size = 0;
670 	stmp->stm_base = NULL;
671 
672 	stmp->stm_mp = NULL;
673 }
674 
/*
 * DMA mappings are split into SFXGE_TX_DESCSIZE (4KB) units:
 * DESCOFFSET extracts the offset within a unit, DESCMASK its base address.
 */
#define	SFXGE_TX_DESCSHIFT	12
#define	SFXGE_TX_DESCSIZE	(1 << SFXGE_TX_DESCSHIFT)

#define	SFXGE_TX_DESCOFFSET	(SFXGE_TX_DESCSIZE - 1)
#define	SFXGE_TX_DESCMASK	(~SFXGE_TX_DESCOFFSET)
680 
681 static int
682 sfxge_tx_msgb_bind(mblk_t *mp, sfxge_tx_mapping_t *stmp)
683 {
684 	ddi_dma_cookie_t dmac;
685 	unsigned int ncookies;
686 	size_t size;
687 	unsigned int n;
688 	int rc;
689 
690 	ASSERT(mp != NULL);
691 	ASSERT3U(DB_TYPE(mp), ==, M_DATA);
692 
693 	ASSERT(stmp->stm_mp == NULL);
694 	stmp->stm_mp = mp;
695 
696 	stmp->stm_base = (caddr_t)(mp->b_rptr);
697 	stmp->stm_size = MBLKL(mp);
698 
699 	/* Bind the STREAMS block to the mapping */
700 	rc = ddi_dma_addr_bind_handle(stmp->stm_dma_handle, NULL,
701 	    stmp->stm_base, stmp->stm_size, DDI_DMA_WRITE | DDI_DMA_STREAMING,
702 	    DDI_DMA_DONTWAIT, NULL, &dmac, &ncookies);
703 	if (rc != DDI_DMA_MAPPED)
704 		goto fail1;
705 
706 	ASSERT3U(ncookies, <=, SFXGE_TX_MAPPING_NADDR);
707 
708 	/*
709 	 * Construct an array of addresses and an initial
710 	 * offset.
711 	 */
712 	n = 0;
713 	stmp->stm_addr[n++] = dmac.dmac_laddress & SFXGE_TX_DESCMASK;
714 	DTRACE_PROBE1(addr, uint64_t, dmac.dmac_laddress & SFXGE_TX_DESCMASK);
715 
716 	stmp->stm_off = dmac.dmac_laddress & SFXGE_TX_DESCOFFSET;
717 
718 	size = MIN(SFXGE_TX_DESCSIZE - stmp->stm_off, dmac.dmac_size);
719 	dmac.dmac_laddress += size;
720 	dmac.dmac_size -= size;
721 
722 	for (;;) {
723 		ASSERT3U(n, <, SFXGE_TX_MAPPING_NADDR);
724 
725 		if (dmac.dmac_size == 0) {
726 			if (--ncookies == 0)
727 				break;
728 
729 			ddi_dma_nextcookie(stmp->stm_dma_handle, &dmac);
730 		}
731 
732 		ASSERT((dmac.dmac_laddress & SFXGE_TX_DESCMASK) != 0);
733 		ASSERT((dmac.dmac_laddress & SFXGE_TX_DESCOFFSET) == 0);
734 		stmp->stm_addr[n++] = dmac.dmac_laddress;
735 		DTRACE_PROBE1(addr, uint64_t, dmac.dmac_laddress);
736 
737 		size = MIN(SFXGE_TX_DESCSIZE, dmac.dmac_size);
738 		dmac.dmac_laddress += size;
739 		dmac.dmac_size -= size;
740 	}
741 	ASSERT3U(n, <=, SFXGE_TX_MAPPING_NADDR);
742 
743 	return (0);
744 
745 fail1:
746 	DTRACE_PROBE1(fail1, int, rc);
747 
748 	stmp->stm_size = 0;
749 	stmp->stm_base = NULL;
750 
751 	stmp->stm_mp = NULL;
752 
753 	return (-1);
754 }
755 
756 static void
757 sfxge_tx_qreap(sfxge_txq_t *stp)
758 {
759 	unsigned int reaped;
760 
761 	ASSERT(mutex_owned(&(stp->st_lock)));
762 
763 	reaped = stp->st_reaped;
764 	while (reaped != stp->st_completed) {
765 		unsigned int id;
766 		sfxge_tx_mapping_t *stmp;
767 		sfxge_tx_buffer_t *stbp;
768 
769 		id = reaped++ & (SFXGE_TX_NDESCS - 1);
770 
771 		ASSERT3P(stp->st_mp[id], ==, NULL);
772 
773 		if ((stmp = stp->st_stmp[id]) != NULL) {
774 			stp->st_stmp[id] = NULL;
775 
776 			/* Free all the mappings */
777 			do {
778 				sfxge_tx_mapping_t *next;
779 
780 				next = stmp->stm_next;
781 				stmp->stm_next = NULL;
782 
783 				sfxge_tx_qfmp_put(stp, stmp);
784 
785 				stmp = next;
786 			} while (stmp != NULL);
787 		}
788 
789 		if ((stbp = stp->st_stbp[id]) != NULL) {
790 			stp->st_stbp[id] = NULL;
791 
792 			/* Free all the buffers */
793 			do {
794 				sfxge_tx_buffer_t *next;
795 
796 				next = stbp->stb_next;
797 				stbp->stb_next = NULL;
798 
799 				stbp->stb_esm.esm_used = 0;
800 				stbp->stb_off = 0;
801 
802 				sfxge_tx_qfbp_put(stp, stbp);
803 
804 				stbp = next;
805 			} while (stbp != NULL);
806 		}
807 	}
808 	stp->st_reaped = reaped;
809 }
810 
811 static void
812 sfxge_tx_qlist_abort(sfxge_txq_t *stp)
813 {
814 	unsigned int id;
815 	sfxge_tx_mapping_t *stmp;
816 	sfxge_tx_buffer_t *stbp;
817 	mblk_t *mp;
818 
819 	ASSERT(mutex_owned(&(stp->st_lock)));
820 
821 	id = stp->st_added & (SFXGE_TX_NDESCS - 1);
822 
823 	/* Clear the completion information */
824 	stmp = stp->st_stmp[id];
825 	stp->st_stmp[id] = NULL;
826 
827 	/* Free any mappings that were used */
828 	while (stmp != NULL) {
829 		sfxge_tx_mapping_t *next;
830 
831 		next = stmp->stm_next;
832 		stmp->stm_next = NULL;
833 
834 		if (stmp->stm_mp != NULL)
835 			sfxge_tx_msgb_unbind(stmp);
836 
837 		sfxge_tx_qfmp_put(stp, stmp);
838 
839 		stmp = next;
840 	}
841 
842 	stbp = stp->st_stbp[id];
843 	stp->st_stbp[id] = NULL;
844 
845 	/* Free any buffers that were used */
846 	while (stbp != NULL) {
847 		sfxge_tx_buffer_t *next;
848 
849 		next = stbp->stb_next;
850 		stbp->stb_next = NULL;
851 
852 		stbp->stb_off = 0;
853 		stbp->stb_esm.esm_used = 0;
854 
855 		sfxge_tx_qfbp_put(stp, stbp);
856 
857 		stbp = next;
858 	}
859 
860 	mp = stp->st_mp[id];
861 	stp->st_mp[id] = NULL;
862 
863 	if (mp != NULL)
864 		freemsg(mp);
865 
866 	/* Clear the fragment list */
867 	stp->st_n = 0;
868 }
869 
/*
 * Push descriptors to the TX ring setting blocked if no space.
 *
 * Posts the st_n fragments held in st_eb[] with efx_tx_qpost().  The caller
 * must hold st_lock and st_n must be non-zero.
 *
 * On success st_n is cleared and any completion information (mappings,
 * buffers, mblk) recorded against the first descriptor is moved to the last
 * descriptor of the list.
 *
 * If there is not enough room, completed descriptors are reaped and the post
 * is retried when that can help.  Otherwise the fragment list is left in
 * place (st_n unchanged) and st_unblock is raised to
 * SFXGE_TXQ_UNBLOCK_LEVEL1, or to LEVEL2 if it was already at LEVEL1.
 */
static void
sfxge_tx_qlist_post(sfxge_txq_t *stp)
{
	unsigned int id;
	unsigned int level;
	unsigned int available;
	int rc;

	ASSERT(mutex_owned(&(stp->st_lock)));

	ASSERT(stp->st_n != 0);

again:
	/* Space is measured against what has been reaped, not completed */
	level = stp->st_added - stp->st_reaped;
	available = EFX_TXQ_LIMIT(SFXGE_TX_NDESCS) - level;

	id = stp->st_added & (SFXGE_TX_NDESCS - 1);

	if (available < stp->st_n) {
		rc = ENOSPC;
		goto fail1;
	}

	ASSERT3U(available, >=, stp->st_n);

	/* Post the fragment list */
	if ((rc = efx_tx_qpost(stp->st_etp, stp->st_eb, stp->st_n,
	    stp->st_reaped, &(stp->st_added))) != 0)
		goto fail2;

	/*
	 * If the list took more than a single descriptor then we need
	 * to move the completion information so it is referenced by the last
	 * descriptor.
	 */
	if (((stp->st_added - 1) & (SFXGE_TX_NDESCS - 1)) != id) {
		sfxge_tx_mapping_t *stmp;
		sfxge_tx_buffer_t *stbp;
		mblk_t *mp;

		stmp = stp->st_stmp[id];
		stp->st_stmp[id] = NULL;

		stbp = stp->st_stbp[id];
		stp->st_stbp[id] = NULL;

		mp = stp->st_mp[id];
		stp->st_mp[id] = NULL;

		id = (stp->st_added - 1) & (SFXGE_TX_NDESCS - 1);

		ASSERT(stp->st_stmp[id] == NULL);
		stp->st_stmp[id] = stmp;

		ASSERT(stp->st_stbp[id] == NULL);
		stp->st_stbp[id] = stbp;

		ASSERT(stp->st_mp[id] == NULL);
		stp->st_mp[id] = mp;
	}

	/* Clear the list */
	stp->st_n = 0;

	ASSERT3U(stp->st_unblock, ==, SFXGE_TXQ_NOT_BLOCKED);
	return;

fail2:
	DTRACE_PROBE(fail2);
fail1:
	DTRACE_PROBE1(fail1, int, rc);

	/* Lack of ring space is the only failure expected here */
	ASSERT(rc == ENOSPC);

	/* Space that would be available once completed entries are reaped */
	level = stp->st_added - stp->st_completed;
	available = EFX_TXQ_LIMIT(SFXGE_TX_NDESCS) - level;

	/*
	 * If there would be enough space after we've reaped any completed
	 * mappings and buffers, and we gain sufficient queue space by doing
	 * so, then reap now and try posting again.
	 */
	if (stp->st_n <= available &&
	    stp->st_completed - stp->st_reaped >= SFXGE_TX_BATCH) {
		sfxge_tx_qreap(stp);

		goto again;
	}

	/* Set the unblock level */
	if (stp->st_unblock == SFXGE_TXQ_NOT_BLOCKED) {
		stp->st_unblock = SFXGE_TXQ_UNBLOCK_LEVEL1;
	} else {
		ASSERT(stp->st_unblock == SFXGE_TXQ_UNBLOCK_LEVEL1);

		stp->st_unblock = SFXGE_TXQ_UNBLOCK_LEVEL2;
	}

	/*
	 * Avoid a race with completion interrupt handling that could leave the
	 * queue blocked.
	 *
	 * NOTE: The use of st_pending rather than st_completed is intentional
	 *	 as st_pending is updated per-event rather than per-batch and
	 *	 therefore avoids needless deferring.
	 */
	if (stp->st_pending == stp->st_added) {
		sfxge_tx_qreap(stp);

		stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;
		goto again;
	}

	ASSERT(stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED);
}
986 
987 static int
988 sfxge_tx_kstat_update(kstat_t *ksp, int rw)
989 {
990 	sfxge_txq_t *stp = ksp->ks_private;
991 	sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
992 	kstat_named_t *knp;
993 	int rc;
994 
995 	ASSERT(mutex_owned(&(stp->st_lock)));
996 
997 	if (rw != KSTAT_READ) {
998 		rc = EACCES;
999 		goto fail1;
1000 	}
1001 
1002 	if (stp->st_state != SFXGE_TXQ_STARTED)
1003 		goto done;
1004 
1005 	efx_tx_qstats_update(stp->st_etp, stp->st_stat);
1006 	knp = (kstat_named_t *)ksp->ks_data + TX_NQSTATS;
1007 	knp->value.ui64 = stdp->get_pkt_limit;
1008 	knp++;
1009 	knp->value.ui64 = stdp->put_pkt_limit;
1010 	knp++;
1011 	knp->value.ui64 = stdp->get_full_count;
1012 	knp++;
1013 	knp->value.ui64 = stdp->put_full_count;
1014 
1015 done:
1016 	return (0);
1017 
1018 fail1:
1019 	DTRACE_PROBE1(fail1, int, rc);
1020 
1021 	return (rc);
1022 }
1023 
1024 static int
1025 sfxge_tx_kstat_init(sfxge_txq_t *stp)
1026 {
1027 	sfxge_t *sp = stp->st_sp;
1028 	unsigned int index = stp->st_index;
1029 	dev_info_t *dip = sp->s_dip;
1030 	kstat_t *ksp;
1031 	kstat_named_t *knp;
1032 	char name[MAXNAMELEN];
1033 	unsigned int id;
1034 	int rc;
1035 
1036 	/* Create the set */
1037 	(void) snprintf(name, MAXNAMELEN - 1, "%s_txq%04d",
1038 	    ddi_driver_name(dip), index);
1039 
1040 	if ((ksp = kstat_create((char *)ddi_driver_name(dip),
1041 	    ddi_get_instance(dip), name, "queue", KSTAT_TYPE_NAMED,
1042 	    TX_NQSTATS + 4, 0)) == NULL) {
1043 		rc = ENOMEM;
1044 		goto fail1;
1045 	}
1046 
1047 	stp->st_ksp = ksp;
1048 
1049 	ksp->ks_update = sfxge_tx_kstat_update;
1050 	ksp->ks_private = stp;
1051 	ksp->ks_lock = &(stp->st_lock);
1052 
1053 	/* Initialise the named stats */
1054 	stp->st_stat = knp = ksp->ks_data;
1055 	for (id = 0; id < TX_NQSTATS; id++) {
1056 		kstat_named_init(knp, (char *)efx_tx_qstat_name(sp->s_enp, id),
1057 		    KSTAT_DATA_UINT64);
1058 		knp++;
1059 	}
1060 	kstat_named_init(knp, "dpl_get_pkt_limit", KSTAT_DATA_UINT64);
1061 	knp++;
1062 	kstat_named_init(knp, "dpl_put_pkt_limit", KSTAT_DATA_UINT64);
1063 	knp++;
1064 	kstat_named_init(knp, "dpl_get_full_count", KSTAT_DATA_UINT64);
1065 	knp++;
1066 	kstat_named_init(knp, "dpl_put_full_count", KSTAT_DATA_UINT64);
1067 
1068 	kstat_install(ksp);
1069 	return (0);
1070 
1071 fail1:
1072 	DTRACE_PROBE1(fail1, int, rc);
1073 
1074 	return (rc);
1075 }
1076 
1077 static void
1078 sfxge_tx_kstat_fini(sfxge_txq_t *stp)
1079 {
1080 	/* Destroy the set */
1081 	kstat_delete(stp->st_ksp);
1082 	stp->st_ksp = NULL;
1083 	stp->st_stat = NULL;
1084 }
1085 
1086 static int
1087 sfxge_tx_qinit(sfxge_t *sp, unsigned int index, sfxge_txq_type_t type,
1088     unsigned int evq)
1089 {
1090 	sfxge_txq_t *stp;
1091 	sfxge_tx_dpl_t *stdp;
1092 	int rc;
1093 
1094 	ASSERT3U(index, <, EFX_ARRAY_SIZE(sp->s_stp));
1095 	ASSERT3U(type, <, SFXGE_TXQ_NTYPES);
1096 	ASSERT3U(evq, <, EFX_ARRAY_SIZE(sp->s_sep));
1097 
1098 	if ((stp = kmem_cache_alloc(sp->s_tqc, KM_SLEEP)) == NULL) {
1099 		rc = ENOMEM;
1100 		goto fail1;
1101 	}
1102 	ASSERT3U(stp->st_state, ==, SFXGE_TXQ_UNINITIALIZED);
1103 
1104 	stdp = &(stp->st_dpl);
1105 
1106 	stp->st_index = index;
1107 	stp->st_type = type;
1108 	stp->st_evq = evq;
1109 
1110 	mutex_init(&(stp->st_lock), NULL, MUTEX_DRIVER,
1111 	    DDI_INTR_PRI(sp->s_intr.si_intr_pri));
1112 
1113 	/* Initialize the statistics */
1114 	if ((rc = sfxge_tx_kstat_init(stp)) != 0)
1115 		goto fail2;
1116 
1117 	stdp->get_pkt_limit = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
1118 	    DDI_PROP_DONTPASS, "tx_dpl_get_pkt_limit",
1119 	    SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT);
1120 
1121 	stdp->put_pkt_limit = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
1122 	    DDI_PROP_DONTPASS, "tx_dpl_put_pkt_limit",
1123 	    SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT);
1124 
1125 	/* Allocate a per-EVQ label for events from this TXQ */
1126 	if ((rc = sfxge_ev_txlabel_alloc(sp, evq, stp, &(stp->st_label))) != 0)
1127 		goto fail2;
1128 
1129 	stp->st_state = SFXGE_TXQ_INITIALIZED;
1130 
1131 	/* Attach the TXQ to the driver */
1132 	ASSERT3P(sp->s_stp[index], ==, NULL);
1133 	sp->s_stp[index] = stp;
1134 	sp->s_tx_qcount++;
1135 
1136 	return (0);
1137 
1138 fail2:
1139 	DTRACE_PROBE(fail2);
1140 
1141 	sfxge_tx_kstat_fini(stp);
1142 
1143 
1144 	stp->st_evq = 0;
1145 	stp->st_type = 0;
1146 	stp->st_index = 0;
1147 
1148 	mutex_destroy(&(stp->st_lock));
1149 
1150 	kmem_cache_free(sp->s_tqc, stp);
1151 
1152 fail1:
1153 	DTRACE_PROBE1(fail1, int, rc);
1154 
1155 	return (rc);
1156 }
1157 
1158 static int
1159 sfxge_tx_qstart(sfxge_t *sp, unsigned int index)
1160 {
1161 	sfxge_txq_t *stp = sp->s_stp[index];
1162 	efx_nic_t *enp = sp->s_enp;
1163 	efsys_mem_t *esmp;
1164 	sfxge_evq_t *sep;
1165 	unsigned int evq;
1166 	unsigned int flags;
1167 	unsigned int desc_index;
1168 	int rc;
1169 
1170 	mutex_enter(&(stp->st_lock));
1171 
1172 	esmp = &(stp->st_mem);
1173 	evq = stp->st_evq;
1174 	sep = sp->s_sep[evq];
1175 
1176 	ASSERT3U(stp->st_state, ==, SFXGE_TXQ_INITIALIZED);
1177 	ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);
1178 
1179 	/* Zero the memory */
1180 	bzero(esmp->esm_base, EFX_TXQ_SIZE(SFXGE_TX_NDESCS));
1181 
1182 	/* Program the buffer table */
1183 	if ((rc = sfxge_sram_buf_tbl_set(sp, stp->st_id, esmp,
1184 	    EFX_TXQ_NBUFS(SFXGE_TX_NDESCS))) != 0)
1185 		goto fail1;
1186 
1187 	switch (stp->st_type) {
1188 	case SFXGE_TXQ_NON_CKSUM:
1189 		flags = 0;
1190 		break;
1191 
1192 	case SFXGE_TXQ_IP_CKSUM:
1193 		flags = EFX_TXQ_CKSUM_IPV4;
1194 		break;
1195 
1196 	case SFXGE_TXQ_IP_TCP_UDP_CKSUM:
1197 		flags = EFX_TXQ_CKSUM_IPV4 | EFX_TXQ_CKSUM_TCPUDP;
1198 		break;
1199 
1200 	default:
1201 		ASSERT(B_FALSE);
1202 
1203 		flags = 0;
1204 		break;
1205 	}
1206 
1207 	/* Create the transmit queue */
1208 	if ((rc = efx_tx_qcreate(enp, index, stp->st_label, esmp,
1209 	    SFXGE_TX_NDESCS, stp->st_id, flags, sep->se_eep,
1210 	    &(stp->st_etp), &desc_index)) != 0)
1211 		goto fail2;
1212 
1213 	/* Initialise queue descriptor indexes */
1214 	stp->st_added = desc_index;
1215 	stp->st_pending = desc_index;
1216 	stp->st_completed = desc_index;
1217 	stp->st_reaped = desc_index;
1218 
1219 	/* Enable the transmit queue */
1220 	efx_tx_qenable(stp->st_etp);
1221 
1222 	stp->st_state = SFXGE_TXQ_STARTED;
1223 
1224 	mutex_exit(&(stp->st_lock));
1225 
1226 	return (0);
1227 
1228 fail2:
1229 	DTRACE_PROBE(fail2);
1230 
1231 	/* Clear entries from the buffer table */
1232 	sfxge_sram_buf_tbl_clear(sp, stp->st_id,
1233 	    EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));
1234 
1235 fail1:
1236 	DTRACE_PROBE1(fail1, int, rc);
1237 
1238 	mutex_exit(&(stp->st_lock));
1239 
1240 	return (rc);
1241 }
1242 
1243 static inline int
1244 sfxge_tx_qmapping_add(sfxge_txq_t *stp, sfxge_tx_mapping_t *stmp,
1245     size_t *offp, size_t *limitp)
1246 {
1247 	mblk_t *mp;
1248 	size_t mapping_off;
1249 	size_t mapping_size;
1250 	int rc;
1251 
1252 	ASSERT3U(*offp, <, stmp->stm_size);
1253 	ASSERT(*limitp != 0);
1254 
1255 	mp = stmp->stm_mp;
1256 
1257 	ASSERT3P(stmp->stm_base, ==, mp->b_rptr);
1258 	ASSERT3U(stmp->stm_size, ==, MBLKL(mp));
1259 
1260 	mapping_off = stmp->stm_off + *offp;
1261 	mapping_size = stmp->stm_size - *offp;
1262 
1263 	while (mapping_size != 0 && *limitp != 0) {
1264 		size_t page =
1265 		    mapping_off >> SFXGE_TX_DESCSHIFT;
1266 		size_t page_off =
1267 		    mapping_off & SFXGE_TX_DESCOFFSET;
1268 		size_t page_size =
1269 		    SFXGE_TX_DESCSIZE - page_off;
1270 		efx_buffer_t *ebp;
1271 
1272 		ASSERT3U(page, <, SFXGE_TX_MAPPING_NADDR);
1273 		ASSERT((stmp->stm_addr[page] & SFXGE_TX_DESCMASK) != 0);
1274 
1275 		page_size = MIN(page_size, mapping_size);
1276 		page_size = MIN(page_size, *limitp);
1277 
1278 		ASSERT3U(stp->st_n, <=,
1279 		    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
1280 		if (stp->st_n ==
1281 		    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS)) {
1282 			rc = ENOSPC;
1283 			goto fail1;
1284 		}
1285 
1286 		ebp = &(stp->st_eb[stp->st_n++]);
1287 		ebp->eb_addr = stmp->stm_addr[page] +
1288 		    page_off;
1289 		ebp->eb_size = page_size;
1290 
1291 		*offp += page_size;
1292 		*limitp -= page_size;
1293 
1294 		mapping_off += page_size;
1295 		mapping_size -= page_size;
1296 
1297 		ebp->eb_eop = (*limitp == 0 ||
1298 		    (mapping_size == 0 && mp->b_cont == NULL));
1299 
1300 		DTRACE_PROBE5(tx_mapping_add,
1301 		    unsigned int, stp->st_index,
1302 		    unsigned int, stp->st_n - 1,
1303 		    uint64_t, ebp->eb_addr,
1304 		    size_t, ebp->eb_size,
1305 		    boolean_t, ebp->eb_eop);
1306 	}
1307 
1308 	ASSERT3U(*offp, <=, stmp->stm_size);
1309 
1310 	return (0);
1311 
1312 fail1:
1313 	DTRACE_PROBE1(fail1, int, rc);
1314 
1315 	return (rc);
1316 }
1317 
1318 static inline int
1319 sfxge_tx_qbuffer_add(sfxge_txq_t *stp, sfxge_tx_buffer_t *stbp, boolean_t eop)
1320 {
1321 	efx_buffer_t *ebp;
1322 	int rc;
1323 
1324 	ASSERT3U(stp->st_n, <=,
1325 	    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
1326 	if (stp->st_n == EFX_TXQ_LIMIT(SFXGE_TX_NDESCS)) {
1327 		rc = ENOSPC;
1328 		goto fail1;
1329 	}
1330 
1331 	ebp = &(stp->st_eb[stp->st_n++]);
1332 	ebp->eb_addr = stbp->stb_esm.esm_addr + stbp->stb_off;
1333 	ebp->eb_size = stbp->stb_esm.esm_used - stbp->stb_off;
1334 	ebp->eb_eop = eop;
1335 
1336 	(void) ddi_dma_sync(stbp->stb_esm.esm_dma_handle,
1337 	    stbp->stb_off, ebp->eb_size,
1338 	    DDI_DMA_SYNC_FORDEV);
1339 
1340 	stbp->stb_off = stbp->stb_esm.esm_used;
1341 
1342 	DTRACE_PROBE5(tx_buffer_add,
1343 	    unsigned int, stp->st_index,
1344 	    unsigned int, stp->st_n - 1,
1345 	    uint64_t, ebp->eb_addr, size_t, ebp->eb_size,
1346 	    boolean_t, ebp->eb_eop);
1347 
1348 	return (0);
1349 
1350 fail1:
1351 	DTRACE_PROBE1(fail1, int, rc);
1352 
1353 	return (rc);
1354 }
1355 
1356 static inline boolean_t
1357 sfxge_tx_msgb_copy(mblk_t *mp, sfxge_tx_buffer_t *stbp, size_t *offp,
1358     size_t *limitp)
1359 {
1360 	size_t data_off;
1361 	size_t data_size;
1362 	size_t copy_off;
1363 	size_t copy_size;
1364 	boolean_t eop;
1365 
1366 	ASSERT3U(*offp, <=, MBLKL(mp));
1367 	ASSERT(*limitp != 0);
1368 
1369 	data_off = *offp;
1370 	data_size = MBLKL(mp) - *offp;
1371 
1372 	copy_off = stbp->stb_esm.esm_used;
1373 	copy_size = SFXGE_TX_BUFFER_SIZE - copy_off;
1374 
1375 	copy_size = MIN(copy_size, data_size);
1376 	copy_size = MIN(copy_size, *limitp);
1377 
1378 	bcopy(mp->b_rptr + data_off,
1379 	    stbp->stb_esm.esm_base + copy_off, copy_size);
1380 
1381 	stbp->stb_esm.esm_used += copy_size;
1382 	ASSERT3U(stbp->stb_esm.esm_used, <=,
1383 	    SFXGE_TX_BUFFER_SIZE);
1384 
1385 	*offp += copy_size;
1386 	*limitp -= copy_size;
1387 
1388 	data_off += copy_size;
1389 	data_size -= copy_size;
1390 
1391 	eop = (*limitp == 0 ||
1392 	    (data_size == 0 && mp->b_cont == NULL));
1393 
1394 	ASSERT3U(*offp, <=, MBLKL(mp));
1395 
1396 	return (eop);
1397 }
1398 
1399 static int
1400 sfxge_tx_qpayload_fragment(sfxge_txq_t *stp, unsigned int id, mblk_t **mpp,
1401     size_t *offp, size_t size, boolean_t copy)
1402 {
1403 	sfxge_t *sp = stp->st_sp;
1404 	mblk_t *mp = *mpp;
1405 	size_t off = *offp;
1406 	sfxge_tx_buffer_t *stbp;
1407 	sfxge_tx_mapping_t *stmp;
1408 	int rc;
1409 
1410 	stbp = stp->st_stbp[id];
1411 	ASSERT(stbp == NULL || (stbp->stb_esm.esm_used == stbp->stb_off));
1412 
1413 	stmp = stp->st_stmp[id];
1414 
1415 	while (size != 0) {
1416 		boolean_t eop;
1417 
1418 		ASSERT(mp != NULL);
1419 
1420 		if (mp->b_cont != NULL)
1421 			prefetch_read_many(mp->b_cont);
1422 
1423 		ASSERT3U(off, <, MBLKL(mp));
1424 
1425 		if (copy)
1426 			goto copy;
1427 
1428 		/*
1429 		 * Check whether we have already mapped this data block for
1430 		 * DMA.
1431 		 */
1432 		if (stmp == NULL || stmp->stm_mp != mp) {
1433 			/*
1434 			 * If we are part way through copying a data block then
1435 			 * there's no point in trying to map it for DMA.
1436 			 */
1437 			if (off != 0)
1438 				goto copy;
1439 
1440 			/*
1441 			 * If the data block is too short then the cost of
1442 			 * mapping it for DMA would outweigh the cost of
1443 			 * copying it.
1444 			 */
1445 			if (MBLKL(mp) < SFXGE_TX_COPY_THRESHOLD)
1446 				goto copy;
1447 
1448 			/* Try to grab a transmit mapping from the pool */
1449 			stmp = sfxge_tx_qfmp_get(stp);
1450 			if (stmp == NULL) {
1451 				/*
1452 				 * The pool was empty so allocate a new
1453 				 * mapping.
1454 				 */
1455 				if ((stmp = kmem_cache_alloc(sp->s_tmc,
1456 				    KM_NOSLEEP)) == NULL)
1457 					goto copy;
1458 			}
1459 
1460 			/* Add the DMA mapping to the list */
1461 			stmp->stm_next = stp->st_stmp[id];
1462 			stp->st_stmp[id] = stmp;
1463 
1464 			/* Try to bind the data block to the mapping */
1465 			if (sfxge_tx_msgb_bind(mp, stmp) != 0)
1466 				goto copy;
1467 		}
1468 		ASSERT3P(stmp->stm_mp, ==, mp);
1469 
1470 		/*
1471 		 * If we have a partially filled buffer then we must add it to
1472 		 * the fragment list before adding the mapping.
1473 		 */
1474 		if (stbp != NULL && (stbp->stb_esm.esm_used > stbp->stb_off)) {
1475 			rc = sfxge_tx_qbuffer_add(stp, stbp, B_FALSE);
1476 			if (rc != 0)
1477 				goto fail1;
1478 		}
1479 
1480 		/* Add the mapping to the fragment list */
1481 		rc = sfxge_tx_qmapping_add(stp, stmp, &off, &size);
1482 		if (rc != 0)
1483 			goto fail2;
1484 
1485 		ASSERT(off == MBLKL(mp) || size == 0);
1486 
1487 		/*
1488 		 * If the data block has been exhausted then Skip over the
1489 		 * control block and advance to the next data block.
1490 		 */
1491 		if (off == MBLKL(mp)) {
1492 			mp = mp->b_cont;
1493 			off = 0;
1494 		}
1495 
1496 		continue;
1497 
1498 copy:
1499 		if (stbp == NULL ||
1500 		    stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE) {
1501 			/* Try to grab a buffer from the pool */
1502 			stbp = sfxge_tx_qfbp_get(stp);
1503 			if (stbp == NULL) {
1504 				/*
1505 				 * The pool was empty so allocate a new
1506 				 * buffer.
1507 				 */
1508 				if ((stbp = kmem_cache_alloc(sp->s_tbc,
1509 				    KM_NOSLEEP)) == NULL) {
1510 					rc = ENOMEM;
1511 					goto fail3;
1512 				}
1513 			}
1514 
1515 			/* Add it to the list */
1516 			stbp->stb_next = stp->st_stbp[id];
1517 			stp->st_stbp[id] = stbp;
1518 		}
1519 
1520 		/* Copy as much of the data block as we can into the buffer */
1521 		eop = sfxge_tx_msgb_copy(mp, stbp, &off, &size);
1522 
1523 		ASSERT(off == MBLKL(mp) || size == 0 ||
1524 		    stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE);
1525 
1526 		/*
1527 		 * If we have reached the end of the packet, or the buffer is
1528 		 * full, then add the buffer to the fragment list.
1529 		 */
1530 		if (stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE || eop) {
1531 			rc = sfxge_tx_qbuffer_add(stp, stbp, eop);
1532 			if (rc != 0)
1533 				goto fail4;
1534 		}
1535 
1536 		/*
1537 		 * If the data block has been exhaused then advance to the next
1538 		 * one.
1539 		 */
1540 		if (off == MBLKL(mp)) {
1541 			mp = mp->b_cont;
1542 			off = 0;
1543 		}
1544 	}
1545 
1546 	*mpp = mp;
1547 	*offp = off;
1548 
1549 	return (0);
1550 
1551 fail4:
1552 	DTRACE_PROBE(fail4);
1553 fail3:
1554 	DTRACE_PROBE(fail3);
1555 fail2:
1556 	DTRACE_PROBE(fail2);
1557 fail1:
1558 	DTRACE_PROBE1(fail1, int, rc);
1559 
1560 	return (rc);
1561 }
1562 
1563 static int
1564 sfxge_tx_qlso_fragment(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp,
1565     boolean_t copy)
1566 {
1567 	sfxge_t *sp = stp->st_sp;
1568 	mblk_t *mp = stpp->stp_mp;
1569 	struct ether_header *etherhp = stpp->stp_etherhp;
1570 	struct ip *iphp = stpp->stp_iphp;
1571 	struct tcphdr *thp = stpp->stp_thp;
1572 	size_t size = stpp->stp_size;
1573 	size_t off = stpp->stp_off;
1574 	size_t mss = stpp->stp_mss;
1575 	unsigned int id;
1576 	caddr_t hp;
1577 	size_t ehs, hs;
1578 	uint16_t start_len;
1579 	uint16_t start_id;
1580 	uint16_t ip_id;
1581 	uint8_t start_flags;
1582 	uint32_t start_seq;
1583 	uint32_t th_seq;
1584 	size_t lss;
1585 	sfxge_tx_buffer_t *stbp;
1586 	int rc;
1587 
1588 	ASSERT(mutex_owned(&(stp->st_lock)));
1589 
1590 	if ((DB_LSOFLAGS(mp) & HW_LSO) == 0) {
1591 		rc = EINVAL;
1592 		goto fail1;
1593 	}
1594 
1595 	id = stp->st_added & (SFXGE_TX_NDESCS - 1);
1596 
1597 	ASSERT(stp->st_n == 0);
1598 	ASSERT(stp->st_stbp[id] == NULL);
1599 	ASSERT(stp->st_stmp[id] == NULL);
1600 
1601 	ehs = (etherhp->ether_type == htons(ETHERTYPE_VLAN)) ?
1602 	    sizeof (struct ether_vlan_header) :
1603 	    sizeof (struct ether_header);
1604 	if (msgdsize(mp) != ehs + ntohs(iphp->ip_len)) {
1605 		rc = EINVAL;
1606 		goto fail2;
1607 	}
1608 
1609 	/* The payload offset is equivalent to the size of the headers */
1610 	hp = (caddr_t)(mp->b_rptr);
1611 	hs = off;
1612 
1613 	/*
1614 	 * If the initial data block only contains the headers then advance
1615 	 * to the next one.
1616 	 */
1617 	if (hs > MBLKL(mp)) {
1618 		rc = EINVAL;
1619 		goto fail3;
1620 	}
1621 	mp->b_rptr += hs;
1622 
1623 	if (MBLKL(mp) == 0)
1624 		mp = mp->b_cont;
1625 
1626 	off = 0;
1627 
1628 	/* Check IP and TCP headers are suitable for LSO */
1629 	if (((iphp->ip_off & ~htons(IP_DF)) != 0) ||
1630 	    ((thp->th_flags & (TH_URG | TH_SYN)) != 0) ||
1631 	    (thp->th_urp != 0)) {
1632 		rc = EINVAL;
1633 		goto fail4;
1634 	}
1635 
1636 	if (size + (thp->th_off << 2) + (iphp->ip_hl << 2) !=
1637 	    ntohs(iphp->ip_len)) {
1638 		rc = EINVAL;
1639 		goto fail4;
1640 	}
1641 
1642 	/*
1643 	 * Get the base IP id, The stack leaves enough of a gap in id space
1644 	 * for us to increment this for each segment we send out.
1645 	 */
1646 	start_len = ntohs(iphp->ip_len);
1647 	start_id = ip_id = ntohs(iphp->ip_id);
1648 
1649 	/* Get the base TCP sequence number and flags */
1650 	start_flags = thp->th_flags;
1651 	start_seq = th_seq = ntohl(thp->th_seq);
1652 
1653 	/* Adjust the header for interim segments */
1654 	iphp->ip_len = htons((iphp->ip_hl << 2) + (thp->th_off << 2) + mss);
1655 	thp->th_flags = start_flags & ~(TH_PUSH | TH_FIN);
1656 
1657 	lss = size;
1658 	if ((lss / mss) >= (EFX_TXQ_LIMIT(SFXGE_TX_NDESCS) / 2)) {
1659 		rc = EINVAL;
1660 		goto fail5;
1661 	}
1662 
1663 	stbp = NULL;
1664 	while (lss != 0) {
1665 		size_t ss = MIN(lss, mss);
1666 		boolean_t eol = (ss == lss);
1667 
1668 		/* Adjust the header for this segment */
1669 		iphp->ip_id = htons(ip_id);
1670 		ip_id++;
1671 
1672 		thp->th_seq = htonl(th_seq);
1673 		th_seq += ss;
1674 
1675 		/* If this is the final segment then do some extra adjustment */
1676 		if (eol) {
1677 			iphp->ip_len = htons((iphp->ip_hl << 2) +
1678 			    (thp->th_off << 2) + ss);
1679 			thp->th_flags = start_flags;
1680 		}
1681 
1682 		if (stbp == NULL ||
1683 		    stbp->stb_esm.esm_used + hs > SFXGE_TX_BUFFER_SIZE) {
1684 			/* Try to grab a buffer from the pool */
1685 			stbp = sfxge_tx_qfbp_get(stp);
1686 			if (stbp == NULL) {
1687 				/*
1688 				 * The pool was empty so allocate a new
1689 				 * buffer.
1690 				 */
1691 				if ((stbp = kmem_cache_alloc(sp->s_tbc,
1692 				    KM_NOSLEEP)) == NULL) {
1693 					rc = ENOMEM;
1694 					goto fail6;
1695 				}
1696 			}
1697 
1698 			/* Add it to the list */
1699 			stbp->stb_next = stp->st_stbp[id];
1700 			stp->st_stbp[id] = stbp;
1701 		}
1702 
1703 		/* Copy in the headers */
1704 		ASSERT3U(stbp->stb_off, ==, stbp->stb_esm.esm_used);
1705 		bcopy(hp, stbp->stb_esm.esm_base + stbp->stb_off, hs);
1706 		stbp->stb_esm.esm_used += hs;
1707 
1708 		/* Add the buffer to the fragment list */
1709 		rc = sfxge_tx_qbuffer_add(stp, stbp, B_FALSE);
1710 		if (rc != 0)
1711 			goto fail7;
1712 
1713 		/* Add the payload to the fragment list */
1714 		if ((rc = sfxge_tx_qpayload_fragment(stp, id, &mp, &off,
1715 		    ss, copy)) != 0)
1716 			goto fail8;
1717 
1718 		lss -= ss;
1719 	}
1720 	ASSERT3U(off, ==, 0);
1721 	ASSERT3P(mp, ==, NULL);
1722 
1723 	ASSERT3U(th_seq - start_seq, ==, size);
1724 
1725 	/*
1726 	 * If no part of the packet has been mapped for DMA then we can free
1727 	 * it now, otherwise it can only be freed on completion.
1728 	 */
1729 	if (stp->st_stmp[id] == NULL)
1730 		freemsg(stpp->stp_mp);
1731 	else
1732 		stp->st_mp[id] = stpp->stp_mp;
1733 
1734 	stpp->stp_mp = NULL;
1735 
1736 	return (0);
1737 
1738 fail8:
1739 	DTRACE_PROBE(fail8);
1740 fail7:
1741 	DTRACE_PROBE(fail7);
1742 fail6:
1743 	DTRACE_PROBE(fail6);
1744 fail5:
1745 	DTRACE_PROBE(fail5);
1746 
1747 	/* Restore the header */
1748 	thp->th_seq = htonl(start_seq);
1749 	thp->th_flags = start_flags;
1750 
1751 	iphp->ip_len = htons(start_len);
1752 	iphp->ip_id = htons(start_id);
1753 
1754 fail4:
1755 	DTRACE_PROBE(fail4);
1756 
1757 	mp = stpp->stp_mp;
1758 	mp->b_rptr -= hs;
1759 
1760 	ASSERT3U(((etherhp->ether_type == htons(ETHERTYPE_VLAN)) ?
1761 	    sizeof (struct ether_vlan_header) :
1762 	    sizeof (struct ether_header)) +
1763 	    ntohs(iphp->ip_len), ==, msgdsize(mp));
1764 
1765 	ASSERT(stp->st_mp[id] == NULL);
1766 
1767 fail3:
1768 	DTRACE_PROBE(fail3);
1769 fail2:
1770 	DTRACE_PROBE(fail2);
1771 fail1:
1772 	DTRACE_PROBE1(fail1, int, rc);
1773 
1774 	return (rc);
1775 }
1776 
1777 static int
1778 sfxge_tx_qpacket_fragment(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp,
1779     boolean_t copy)
1780 {
1781 	sfxge_t *sp = stp->st_sp;
1782 	mblk_t *mp = stpp->stp_mp;
1783 	unsigned int id;
1784 	size_t off;
1785 	size_t size;
1786 	sfxge_tx_mapping_t *stmp;
1787 	sfxge_tx_buffer_t *stbp;
1788 	int rc;
1789 
1790 	ASSERT(mutex_owned(&(stp->st_lock)));
1791 
1792 	ASSERT(stp->st_n == 0);
1793 
1794 	id = stp->st_added & (SFXGE_TX_NDESCS - 1);
1795 
1796 	ASSERT(stp->st_stbp[id] == NULL);
1797 	ASSERT(stp->st_stmp[id] == NULL);
1798 
1799 	off = 0;
1800 	size = LONG_MAX;	/* must be larger than the packet */
1801 
1802 	stbp = NULL;
1803 	stmp = NULL;
1804 
1805 	while (mp != NULL) {
1806 		boolean_t eop;
1807 
1808 		ASSERT(mp != NULL);
1809 
1810 		if (mp->b_cont != NULL)
1811 			prefetch_read_many(mp->b_cont);
1812 
1813 		ASSERT(stmp == NULL || stmp->stm_mp != mp);
1814 
1815 		if (copy)
1816 			goto copy;
1817 
1818 		/*
1819 		 * If we are part way through copying a data block then there's
1820 		 * no point in trying to map it for DMA.
1821 		 */
1822 		if (off != 0)
1823 			goto copy;
1824 
1825 		/*
1826 		 * If the data block is too short then the cost of mapping it
1827 		 * for DMA would outweigh the cost of copying it.
1828 		 *
1829 		 * TX copy break
1830 		 */
1831 		if (MBLKL(mp) < SFXGE_TX_COPY_THRESHOLD)
1832 			goto copy;
1833 
1834 		/* Try to grab a transmit mapping from the pool */
1835 		stmp = sfxge_tx_qfmp_get(stp);
1836 		if (stmp == NULL) {
1837 			/*
1838 			 * The pool was empty so allocate a new
1839 			 * mapping.
1840 			 */
1841 			if ((stmp = kmem_cache_alloc(sp->s_tmc,
1842 			    KM_NOSLEEP)) == NULL)
1843 				goto copy;
1844 		}
1845 
1846 		/* Add the DMA mapping to the list */
1847 		stmp->stm_next = stp->st_stmp[id];
1848 		stp->st_stmp[id] = stmp;
1849 
1850 		/* Try to bind the data block to the mapping */
1851 		if (sfxge_tx_msgb_bind(mp, stmp) != 0)
1852 			goto copy;
1853 
1854 		/*
1855 		 * If we have a partially filled buffer then we must add it to
1856 		 * the fragment list before adding the mapping.
1857 		 */
1858 		if (stbp != NULL && (stbp->stb_esm.esm_used > stbp->stb_off)) {
1859 			rc = sfxge_tx_qbuffer_add(stp, stbp, B_FALSE);
1860 			if (rc != 0)
1861 				goto fail1;
1862 		}
1863 
1864 		/* Add the mapping to the fragment list */
1865 		rc = sfxge_tx_qmapping_add(stp, stmp, &off, &size);
1866 		if (rc != 0)
1867 			goto fail2;
1868 
1869 		ASSERT3U(off, ==, MBLKL(mp));
1870 
1871 		/* Advance to the next data block */
1872 		mp = mp->b_cont;
1873 		off = 0;
1874 		continue;
1875 
1876 copy:
1877 		if (stbp == NULL ||
1878 		    stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE) {
1879 			/* Try to grab a buffer from the pool */
1880 			stbp = sfxge_tx_qfbp_get(stp);
1881 			if (stbp == NULL) {
1882 				/*
1883 				 * The pool was empty so allocate a new
1884 				 * buffer.
1885 				 */
1886 				if ((stbp = kmem_cache_alloc(sp->s_tbc,
1887 				    KM_NOSLEEP)) == NULL) {
1888 					rc = ENOMEM;
1889 					goto fail3;
1890 				}
1891 			}
1892 
1893 			/* Add it to the list */
1894 			stbp->stb_next = stp->st_stbp[id];
1895 			stp->st_stbp[id] = stbp;
1896 		}
1897 
1898 		/* Copy as much of the data block as we can into the buffer */
1899 		eop = sfxge_tx_msgb_copy(mp, stbp, &off, &size);
1900 
1901 		ASSERT(off == MBLKL(mp) ||
1902 		    stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE);
1903 
1904 		/*
1905 		 * If we have reached the end of the packet, or the buffer is
1906 		 * full, then add the buffer to the fragment list.
1907 		 */
1908 		if (stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE || eop) {
1909 			rc = sfxge_tx_qbuffer_add(stp, stbp, eop);
1910 			if (rc != 0)
1911 				goto fail4;
1912 		}
1913 
1914 		/*
1915 		 * If the data block has been exhaused then advance to the next
1916 		 * one.
1917 		 */
1918 		if (off == MBLKL(mp)) {
1919 			mp = mp->b_cont;
1920 			off = 0;
1921 		}
1922 	}
1923 	ASSERT3U(off, ==, 0);
1924 	ASSERT3P(mp, ==, NULL);
1925 	ASSERT3U(size, !=, 0);
1926 
1927 	/*
1928 	 * If no part of the packet has been mapped for DMA then we can free
1929 	 * it now, otherwise it can only be freed on completion.
1930 	 */
1931 	if (stp->st_stmp[id] == NULL)
1932 		freemsg(stpp->stp_mp);
1933 	else
1934 		stp->st_mp[id] = stpp->stp_mp;
1935 
1936 	stpp->stp_mp = NULL;
1937 
1938 	return (0);
1939 
1940 fail4:
1941 	DTRACE_PROBE(fail4);
1942 fail3:
1943 	DTRACE_PROBE(fail3);
1944 fail2:
1945 	DTRACE_PROBE(fail2);
1946 fail1:
1947 	DTRACE_PROBE1(fail1, int, rc);
1948 
1949 	ASSERT(stp->st_stmp[id] == NULL);
1950 
1951 	return (rc);
1952 }
1953 
1954 
1955 #define	SFXGE_TX_QDPL_PUT_PENDING(_stp)					\
1956 	((_stp)->st_dpl.std_put != 0)
1957 
1958 static void
1959 sfxge_tx_qdpl_swizzle(sfxge_txq_t *stp)
1960 {
1961 	sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
1962 	volatile uintptr_t *putp;
1963 	uintptr_t put;
1964 	sfxge_tx_packet_t *stpp;
1965 	sfxge_tx_packet_t *p;
1966 	sfxge_tx_packet_t **pp;
1967 	unsigned int count;
1968 
1969 	ASSERT(mutex_owned(&(stp->st_lock)));
1970 
1971 	/*
1972 	 * Guaranteed that in flight TX packets will cause more TX completions
1973 	 * hence more swizzles must happen
1974 	 */
1975 	ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));
1976 	if (stdp->std_count >= stdp->get_pkt_limit)
1977 		return;
1978 
1979 	/* Acquire the put list - replacing with an empty list */
1980 	putp = &(stdp->std_put);
1981 	put = atomic_swap_ulong(putp, 0);
1982 	stpp = (void *)put;
1983 
1984 	if (stpp == NULL)
1985 		return;
1986 
1987 	/* Reverse the list */
1988 	pp = &(stpp->stp_next);
1989 	p = NULL;
1990 
1991 	count = 0;
1992 	do {
1993 		sfxge_tx_packet_t *next;
1994 
1995 		next = stpp->stp_next;
1996 
1997 		stpp->stp_next = p;
1998 		p = stpp;
1999 
2000 		count++;
2001 		stpp = next;
2002 	} while (stpp != NULL);
2003 
2004 	/* Add it to the tail of the get list */
2005 	ASSERT3P(*pp, ==, NULL);
2006 
2007 	*(stdp->std_getp) = p;
2008 	stdp->std_getp = pp;
2009 	stdp->std_count += count;
2010 	ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));
2011 
2012 	DTRACE_PROBE2(dpl_counts, int, stdp->std_count, int, count);
2013 }
2014 
2015 
2016 /*
2017  * If TXQ locked, add the RX DPL put list and this packet to the TX DPL get list
2018  * If TXQ unlocked, atomically add this packet to TX DPL put list
2019  *
2020  * The only possible error is ENOSPC (used for TX backpressure)
2021  * For the TX DPL put or get list becoming full, in both cases there must be
2022  * future TX completions (as represented by the packets on the DPL get lists).
2023  *
2024  * This ensures that in the future mac_tx_update() will be called from
2025  * sfxge_tx_qcomplete()
2026  */
2027 static inline int
2028 sfxge_tx_qdpl_add(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp, int locked)
2029 {
2030 	sfxge_tx_dpl_t *stdp = &stp->st_dpl;
2031 
2032 	ASSERT3P(stpp->stp_next, ==, NULL);
2033 
2034 	if (locked) {
2035 		ASSERT(mutex_owned(&stp->st_lock));
2036 
2037 		if (stdp->std_count >= stdp->get_pkt_limit) {
2038 			stdp->get_full_count++;
2039 			return (ENOSPC);
2040 		}
2041 
2042 		/* Reverse the put list onto the get list */
2043 		sfxge_tx_qdpl_swizzle(stp);
2044 
2045 		/* Add to the tail of the get list */
2046 		*(stdp->std_getp) = stpp;
2047 		stdp->std_getp = &stpp->stp_next;
2048 		stdp->std_count++;
2049 		ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));
2050 
2051 	} else {
2052 		volatile uintptr_t *putp;
2053 		uintptr_t old;
2054 		uintptr_t new;
2055 		sfxge_tx_packet_t *old_pkt;
2056 
2057 		putp = &(stdp->std_put);
2058 		new = (uintptr_t)stpp;
2059 
2060 		/* Add to the head of the put list, keeping a list length */
2061 		do {
2062 			old = *putp;
2063 			old_pkt =  (sfxge_tx_packet_t *)old;
2064 
2065 			stpp->stp_dpl_put_len = old ?
2066 			    old_pkt->stp_dpl_put_len + 1 : 1;
2067 
2068 			if (stpp->stp_dpl_put_len >= stdp->put_pkt_limit) {
2069 				stpp->stp_next = 0;
2070 				stpp->stp_dpl_put_len = 0;
2071 				stdp->put_full_count++;
2072 				return (ENOSPC);
2073 			}
2074 
2075 			stpp->stp_next = (void *)old;
2076 		} while (atomic_cas_ulong(putp, old, new) != old);
2077 	}
2078 	return (0);
2079 }
2080 
2081 
2082 /* Take all packets from DPL get list and try to send to HW */
2083 static void
2084 sfxge_tx_qdpl_drain(sfxge_txq_t *stp)
2085 {
2086 	sfxge_t *sp = stp->st_sp;
2087 	sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
2088 	unsigned int pushed = stp->st_added;
2089 	sfxge_tx_packet_t *stpp;
2090 	unsigned int count;
2091 
2092 	ASSERT(mutex_owned(&(stp->st_lock)));
2093 
2094 	prefetch_read_many(sp->s_enp);
2095 	prefetch_read_many(stp->st_etp);
2096 
2097 	stpp = stdp->std_get;
2098 	count = stdp->std_count;
2099 
2100 	while (count != 0) {
2101 		sfxge_tx_packet_t *next;
2102 		boolean_t copy;
2103 		int rc;
2104 
2105 		ASSERT(stpp != NULL);
2106 
2107 		/* Split stpp off */
2108 		next = stpp->stp_next;
2109 		stpp->stp_next = NULL;
2110 
2111 		if (next != NULL)
2112 			prefetch_read_many(next);
2113 
2114 		if (stp->st_state != SFXGE_TXQ_STARTED)
2115 			goto reject;
2116 
2117 		copy = B_FALSE;
2118 
2119 again:
2120 		/* Fragment the packet */
2121 		if (stpp->stp_mss != 0) {
2122 			rc = sfxge_tx_qlso_fragment(stp, stpp, copy);
2123 		} else {
2124 			rc = sfxge_tx_qpacket_fragment(stp, stpp, copy);
2125 		}
2126 
2127 		switch (rc) {
2128 		case 0:
2129 			break;
2130 
2131 		case ENOSPC:
2132 			if (!copy)
2133 				goto copy;
2134 
2135 		/*FALLTHRU*/
2136 		default:
2137 			goto reject;
2138 		}
2139 
2140 		/* Free the packet structure */
2141 		stpp->stp_etherhp = NULL;
2142 		stpp->stp_iphp = NULL;
2143 		stpp->stp_thp = NULL;
2144 		stpp->stp_off = 0;
2145 		stpp->stp_size = 0;
2146 		stpp->stp_mss = 0;
2147 		stpp->stp_dpl_put_len = 0;
2148 
2149 		ASSERT3P(stpp->stp_mp, ==, NULL);
2150 
2151 		if (sfxge_tx_qfpp_put(stp, stpp) != 0) {
2152 			sfxge_tx_packet_destroy(sp, stpp);
2153 			stpp = NULL;
2154 		}
2155 
2156 		--count;
2157 		stpp = next;
2158 
2159 		/* Post the packet */
2160 		sfxge_tx_qlist_post(stp);
2161 
2162 		if (stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED)
2163 			goto defer;
2164 
2165 		if (stp->st_added - pushed >= SFXGE_TX_BATCH) {
2166 			efx_tx_qpush(stp->st_etp, stp->st_added, pushed);
2167 			pushed = stp->st_added;
2168 		}
2169 
2170 		continue;
2171 
2172 copy:
2173 		/* Abort the current fragment list */
2174 		sfxge_tx_qlist_abort(stp);
2175 
2176 		/* Try copying the packet to flatten it */
2177 		ASSERT(!copy);
2178 		copy = B_TRUE;
2179 
2180 		goto again;
2181 
2182 reject:
2183 		/* Abort the current fragment list */
2184 		sfxge_tx_qlist_abort(stp);
2185 
2186 		/* Discard the packet */
2187 		freemsg(stpp->stp_mp);
2188 		stpp->stp_mp = NULL;
2189 
2190 		/* Free the packet structure */
2191 		stpp->stp_etherhp = NULL;
2192 		stpp->stp_iphp = NULL;
2193 		stpp->stp_thp = NULL;
2194 		stpp->stp_off = 0;
2195 		stpp->stp_size = 0;
2196 		stpp->stp_mss = 0;
2197 		stpp->stp_dpl_put_len = 0;
2198 
2199 		if (sfxge_tx_qfpp_put(stp, stpp) != 0) {
2200 			sfxge_tx_packet_destroy(sp, stpp);
2201 			stpp = NULL;
2202 		}
2203 
2204 		--count;
2205 		stpp = next;
2206 		continue;
2207 defer:
2208 		DTRACE_PROBE1(defer, unsigned int, stp->st_index);
2209 		break;
2210 	}
2211 
2212 	if (count == 0) {
2213 		/* New empty get list */
2214 		ASSERT3P(stpp, ==, NULL);
2215 		stdp->std_get = NULL;
2216 		stdp->std_count = 0;
2217 
2218 		stdp->std_getp = &(stdp->std_get);
2219 	} else {
2220 		/* shorten the list by moving the head */
2221 		stdp->std_get = stpp;
2222 		stdp->std_count = count;
2223 		ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));
2224 	}
2225 
2226 	if (stp->st_added != pushed)
2227 		efx_tx_qpush(stp->st_etp, stp->st_added, pushed);
2228 
2229 	ASSERT(stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED ||
2230 	    stdp->std_count == 0);
2231 }
2232 
2233 /* Swizzle deferred packet list, try and push to HW */
2234 static inline void
2235 sfxge_tx_qdpl_service(sfxge_txq_t *stp)
2236 {
2237 	do {
2238 		ASSERT(mutex_owned(&(stp->st_lock)));
2239 
2240 		if (SFXGE_TX_QDPL_PUT_PENDING(stp))
2241 			sfxge_tx_qdpl_swizzle(stp);
2242 
2243 		if (stp->st_unblock == SFXGE_TXQ_NOT_BLOCKED)
2244 			sfxge_tx_qdpl_drain(stp);
2245 
2246 		mutex_exit(&(stp->st_lock));
2247 
2248 		if (!SFXGE_TX_QDPL_PUT_PENDING(stp))
2249 			break;
2250 	} while (mutex_tryenter(&(stp->st_lock)));
2251 }
2252 
2253 static void
2254 sfxge_tx_qdpl_flush_locked(sfxge_txq_t *stp)
2255 {
2256 	sfxge_t *sp = stp->st_sp;
2257 	sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
2258 	sfxge_tx_packet_t *stpp;
2259 	unsigned int count;
2260 
2261 	ASSERT(mutex_owned(&(stp->st_lock)));
2262 
2263 	/* Swizzle put list to the get list */
2264 	sfxge_tx_qdpl_swizzle(stp);
2265 
2266 	stpp = stdp->std_get;
2267 	count = stdp->std_count;
2268 
2269 	while (count != 0) {
2270 		sfxge_tx_packet_t *next;
2271 
2272 		next = stpp->stp_next;
2273 		stpp->stp_next = NULL;
2274 
2275 		/* Discard the packet */
2276 		freemsg(stpp->stp_mp);
2277 		stpp->stp_mp = NULL;
2278 
2279 		/* Free the packet structure */
2280 		stpp->stp_etherhp = NULL;
2281 		stpp->stp_iphp = NULL;
2282 		stpp->stp_thp = NULL;
2283 		stpp->stp_off = 0;
2284 		stpp->stp_size = 0;
2285 		stpp->stp_mss = 0;
2286 		stpp->stp_dpl_put_len = 0;
2287 
2288 		sfxge_tx_packet_destroy(sp, stpp);
2289 
2290 		--count;
2291 		stpp = next;
2292 	}
2293 
2294 	ASSERT3P(stpp, ==, NULL);
2295 
2296 	/* Empty list */
2297 	stdp->std_get = NULL;
2298 	stdp->std_count = 0;
2299 	stdp->std_getp = &(stdp->std_get);
2300 }
2301 
2302 
2303 void
2304 sfxge_tx_qdpl_flush(sfxge_txq_t *stp)
2305 {
2306 	mutex_enter(&(stp->st_lock));
2307 	sfxge_tx_qdpl_flush_locked(stp);
2308 	mutex_exit(&(stp->st_lock));
2309 }
2310 
2311 
2312 static void
2313 sfxge_tx_qunblock(sfxge_txq_t *stp)
2314 {
2315 	sfxge_t *sp = stp->st_sp;
2316 	unsigned int evq = stp->st_evq;
2317 	sfxge_evq_t *sep = sp->s_sep[evq];
2318 
2319 	ASSERT(mutex_owned(&(sep->se_lock)));
2320 
2321 	mutex_enter(&(stp->st_lock));
2322 
2323 	if (stp->st_state != SFXGE_TXQ_STARTED) {
2324 		mutex_exit(&(stp->st_lock));
2325 		return;
2326 	}
2327 
2328 	if (stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED) {
2329 		unsigned int level;
2330 
2331 		level = stp->st_added - stp->st_completed;
2332 		if (level <= stp->st_unblock) {
2333 			stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;
2334 			sfxge_tx_qlist_post(stp);
2335 		}
2336 	}
2337 
2338 	sfxge_tx_qdpl_service(stp);
2339 	/* lock has been dropped */
2340 }
2341 
2342 void
2343 sfxge_tx_qcomplete(sfxge_txq_t *stp)
2344 {
2345 	sfxge_t *sp = stp->st_sp;
2346 	sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
2347 	unsigned int evq = stp->st_evq;
2348 	sfxge_evq_t *sep = sp->s_sep[evq];
2349 	unsigned int completed;
2350 
2351 	ASSERT(mutex_owned(&(sep->se_lock)));
2352 
2353 	completed = stp->st_completed;
2354 	while (completed != stp->st_pending) {
2355 		unsigned int id;
2356 		sfxge_tx_mapping_t *stmp;
2357 
2358 		id = completed++ & (SFXGE_TX_NDESCS - 1);
2359 
2360 		if ((stmp = stp->st_stmp[id]) != NULL) {
2361 			mblk_t *mp;
2362 
2363 			/* Unbind all the mappings */
2364 			do {
2365 				ASSERT(stmp->stm_mp != NULL);
2366 				sfxge_tx_msgb_unbind(stmp);
2367 
2368 				stmp = stmp->stm_next;
2369 			} while (stmp != NULL);
2370 
2371 			/*
2372 			 * Now that the packet is no longer mapped for DMA it
2373 			 * can be freed.
2374 			 */
2375 			mp = stp->st_mp[id];
2376 			stp->st_mp[id] = NULL;
2377 
2378 			ASSERT(mp != NULL);
2379 			freemsg(mp);
2380 		}
2381 	}
2382 	stp->st_completed = completed;
2383 
2384 	/* Check whether we need to unblock the queue */
2385 	if (stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED) {
2386 		unsigned int level;
2387 
2388 		level = stp->st_added - stp->st_completed;
2389 		if (level <= stp->st_unblock)
2390 			sfxge_tx_qunblock(stp);
2391 	}
2392 
2393 	/* Release TX backpressure from the TX DPL put/get list being full */
2394 	if (stdp->std_count < stdp->get_pkt_limit)
2395 		mac_tx_update(sp->s_mh);
2396 }
2397 
2398 void
2399 sfxge_tx_qflush_done(sfxge_txq_t *stp)
2400 {
2401 	sfxge_t *sp = stp->st_sp;
2402 	boolean_t flush_pending = B_FALSE;
2403 
2404 	ASSERT(mutex_owned(&(sp->s_sep[stp->st_evq]->se_lock)));
2405 
2406 	mutex_enter(&(stp->st_lock));
2407 
2408 	switch (stp->st_state) {
2409 	case SFXGE_TXQ_INITIALIZED:
2410 		/* Ignore flush event after TxQ destroyed */
2411 		break;
2412 
2413 	case SFXGE_TXQ_FLUSH_PENDING:
2414 		flush_pending = B_TRUE;
2415 		stp->st_state = SFXGE_TXQ_FLUSH_DONE;
2416 		break;
2417 
2418 	case SFXGE_TXQ_FLUSH_FAILED:
2419 		/* MC may have rebooted before handling the flush request */
2420 		stp->st_state = SFXGE_TXQ_FLUSH_DONE;
2421 		break;
2422 
2423 	case SFXGE_TXQ_STARTED:
2424 		/*
2425 		 * MC initiated flush on MC reboot or because of bad Tx
2426 		 * descriptor
2427 		 */
2428 		stp->st_state = SFXGE_TXQ_FLUSH_DONE;
2429 		break;
2430 
2431 	case SFXGE_TXQ_FLUSH_DONE:
2432 		/* Ignore unexpected extra flush event */
2433 		ASSERT(B_FALSE);
2434 		break;
2435 
2436 	default:
2437 		ASSERT(B_FALSE);
2438 	}
2439 
2440 
2441 	mutex_exit(&(stp->st_lock));
2442 
2443 	if (flush_pending == B_FALSE) {
2444 		/* Flush was not pending */
2445 		return;
2446 	}
2447 
2448 	mutex_enter(&(sp->s_tx_flush_lock));
2449 	sp->s_tx_flush_pending--;
2450 	if (sp->s_tx_flush_pending <= 0) {
2451 		/* All queues flushed: wakeup sfxge_tx_stop() */
2452 		cv_signal(&(sp->s_tx_flush_kv));
2453 	}
2454 	mutex_exit(&(sp->s_tx_flush_lock));
2455 }
2456 
2457 static void
2458 sfxge_tx_qflush(sfxge_t *sp, unsigned int index, boolean_t wait_for_flush)
2459 {
2460 	sfxge_txq_t *stp = sp->s_stp[index];
2461 	int rc;
2462 
2463 	ASSERT(mutex_owned(&(sp->s_state_lock)));
2464 	ASSERT(mutex_owned(&(sp->s_tx_flush_lock)));
2465 
2466 	mutex_enter(&(stp->st_lock));
2467 
2468 	/* Prepare to flush and stop the queue */
2469 	if (stp->st_state == SFXGE_TXQ_STARTED) {
2470 		/* Flush the transmit queue */
2471 		if ((rc = efx_tx_qflush(stp->st_etp)) == EALREADY) {
2472 			/* Already flushed, may be initiated by MC */
2473 			stp->st_state = SFXGE_TXQ_FLUSH_DONE;
2474 		} else if (rc != 0) {
2475 			/* Unexpected error */
2476 			stp->st_state = SFXGE_TXQ_FLUSH_FAILED;
2477 		} else if (wait_for_flush) {
2478 			stp->st_state = SFXGE_TXQ_FLUSH_PENDING;
2479 			sp->s_tx_flush_pending++;
2480 		} else {
2481 			/* Assume the flush is done */
2482 			stp->st_state = SFXGE_TXQ_FLUSH_DONE;
2483 		}
2484 	}
2485 
2486 	mutex_exit(&(stp->st_lock));
2487 }
2488 
2489 static void
2490 sfxge_tx_qstop(sfxge_t *sp, unsigned int index)
2491 {
2492 	sfxge_txq_t *stp = sp->s_stp[index];
2493 	unsigned int evq = stp->st_evq;
2494 	sfxge_evq_t *sep = sp->s_sep[evq];
2495 
2496 	mutex_enter(&(sep->se_lock));
2497 	mutex_enter(&(stp->st_lock));
2498 
2499 	if (stp->st_state == SFXGE_TXQ_INITIALIZED)
2500 		goto done;
2501 
2502 	ASSERT(stp->st_state == SFXGE_TXQ_FLUSH_PENDING ||
2503 	    stp->st_state == SFXGE_TXQ_FLUSH_DONE ||
2504 	    stp->st_state == SFXGE_TXQ_FLUSH_FAILED);
2505 
2506 	/* All queues should have been flushed */
2507 	if (stp->st_sp->s_tx_flush_pending != 0) {
2508 		dev_err(sp->s_dip, CE_NOTE,
2509 		    SFXGE_CMN_ERR "txq[%d] stop with flush_pending=%d",
2510 		    index, stp->st_sp->s_tx_flush_pending);
2511 	}
2512 	if (stp->st_state == SFXGE_TXQ_FLUSH_FAILED) {
2513 		dev_err(sp->s_dip, CE_NOTE,
2514 		    SFXGE_CMN_ERR "txq[%d] flush failed", index);
2515 	}
2516 
2517 	/* Destroy the transmit queue */
2518 	efx_tx_qdestroy(stp->st_etp);
2519 	stp->st_etp = NULL;
2520 
2521 	/* Clear entries from the buffer table */
2522 	sfxge_sram_buf_tbl_clear(sp, stp->st_id,
2523 	    EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));
2524 
2525 	sfxge_tx_qlist_abort(stp);
2526 	ASSERT3U(stp->st_n, ==, 0);
2527 
2528 	stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;
2529 
2530 	stp->st_pending = stp->st_added;
2531 
2532 	sfxge_tx_qcomplete(stp);
2533 	ASSERT3U(stp->st_completed, ==, stp->st_pending);
2534 
2535 	sfxge_tx_qreap(stp);
2536 	ASSERT3U(stp->st_reaped, ==, stp->st_completed);
2537 
2538 	/*
2539 	 * Ensure the deferred packet list is cleared
2540 	 * Can race with sfxge_tx_packet_add() adding to the put list
2541 	 */
2542 	sfxge_tx_qdpl_flush_locked(stp);
2543 
2544 	stp->st_added = 0;
2545 	stp->st_pending = 0;
2546 	stp->st_completed = 0;
2547 	stp->st_reaped = 0;
2548 
2549 	stp->st_state = SFXGE_TXQ_INITIALIZED;
2550 
2551 done:
2552 	mutex_exit(&(stp->st_lock));
2553 	mutex_exit(&(sep->se_lock));
2554 }
2555 
2556 static void
2557 sfxge_tx_qfini(sfxge_t *sp, unsigned int index)
2558 {
2559 	sfxge_txq_t *stp = sp->s_stp[index];
2560 	sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
2561 
2562 	ASSERT3U(stp->st_state, ==, SFXGE_TXQ_INITIALIZED);
2563 	stp->st_state = SFXGE_TXQ_UNINITIALIZED;
2564 
2565 	/* Detach the TXQ from the driver */
2566 	sp->s_stp[index] = NULL;
2567 	ASSERT(sp->s_tx_qcount > 0);
2568 	sp->s_tx_qcount--;
2569 
2570 	/* Free the EVQ label for events from this TXQ */
2571 	(void) sfxge_ev_txlabel_free(sp, stp->st_evq, stp, stp->st_label);
2572 	stp->st_label = 0;
2573 
2574 	/* Tear down the statistics */
2575 	sfxge_tx_kstat_fini(stp);
2576 
2577 	/* Ensure the deferred packet list is empty */
2578 	ASSERT3U(stdp->std_count, ==, 0);
2579 	ASSERT3P(stdp->std_get, ==, NULL);
2580 	ASSERT3U(stdp->std_put, ==, 0);
2581 
2582 	/* Clear the free buffer pool */
2583 	sfxge_tx_qfbp_empty(stp);
2584 
2585 	/* Clear the free mapping pool */
2586 	sfxge_tx_qfmp_empty(stp);
2587 
2588 	/* Clear the free packet pool */
2589 	sfxge_tx_qfpp_empty(stp);
2590 
2591 	mutex_destroy(&(stp->st_lock));
2592 
2593 	stp->st_evq = 0;
2594 	stp->st_type = 0;
2595 	stp->st_index = 0;
2596 
2597 	kmem_cache_free(sp->s_tqc, stp);
2598 }
2599 
2600 int
2601 sfxge_tx_init(sfxge_t *sp)
2602 {
2603 	sfxge_intr_t *sip = &(sp->s_intr);
2604 	char name[MAXNAMELEN];
2605 	sfxge_txq_type_t qtype;
2606 	unsigned int txq, evq;
2607 	int index;
2608 	int rc;
2609 
2610 	(void) snprintf(name, MAXNAMELEN - 1, "%s%d_tx_packet_cache",
2611 	    ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2612 
2613 	sp->s_tpc = kmem_cache_create(name, sizeof (sfxge_tx_packet_t),
2614 	    SFXGE_CPU_CACHE_SIZE, sfxge_tx_packet_ctor, sfxge_tx_packet_dtor,
2615 	    NULL, sp, NULL, 0);
2616 	ASSERT(sp->s_tpc != NULL);
2617 
2618 	(void) snprintf(name, MAXNAMELEN - 1, "%s%d_tx_buffer_cache",
2619 	    ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2620 
2621 	sp->s_tbc = kmem_cache_create(name, sizeof (sfxge_tx_buffer_t),
2622 	    SFXGE_CPU_CACHE_SIZE, sfxge_tx_buffer_ctor, sfxge_tx_buffer_dtor,
2623 	    NULL, sp, NULL, 0);
2624 	ASSERT(sp->s_tbc != NULL);
2625 
2626 	(void) snprintf(name, MAXNAMELEN - 1, "%s%d_tx_mapping_cache",
2627 	    ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2628 
2629 	sp->s_tmc = kmem_cache_create(name, sizeof (sfxge_tx_mapping_t),
2630 	    SFXGE_CPU_CACHE_SIZE, sfxge_tx_mapping_ctor, sfxge_tx_mapping_dtor,
2631 	    NULL, sp, NULL, 0);
2632 	ASSERT(sp->s_tmc != NULL);
2633 
2634 	(void) snprintf(name, MAXNAMELEN - 1, "%s%d_txq_cache",
2635 	    ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2636 
2637 	sp->s_tqc = kmem_cache_create(name, sizeof (sfxge_txq_t),
2638 	    SFXGE_CPU_CACHE_SIZE, sfxge_tx_qctor, sfxge_tx_qdtor, NULL, sp,
2639 	    NULL, 0);
2640 	ASSERT(sp->s_tqc != NULL);
2641 
2642 	/* Initialize the transmit queues. */
2643 	sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM]		= sip->si_nalloc;
2644 	sp->s_tx_scale_max[SFXGE_TXQ_IP_CKSUM]		= 1;
2645 	sp->s_tx_scale_max[SFXGE_TXQ_IP_TCP_UDP_CKSUM]	= sip->si_nalloc;
2646 
2647 	/* Ensure minimum queue counts required by sfxge_tx_packet_add(). */
2648 	if (sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM] < 1)
2649 		sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM] = 1;
2650 
2651 	if (sp->s_tx_scale_max[SFXGE_TXQ_IP_CKSUM] < 1)
2652 		sp->s_tx_scale_max[SFXGE_TXQ_IP_CKSUM] = 1;
2653 
2654 	txq = 0;
2655 	for (qtype = 0; qtype < SFXGE_TXQ_NTYPES; qtype++) {
2656 		unsigned int tx_scale = sp->s_tx_scale_max[qtype];
2657 
2658 		if (txq + tx_scale > EFX_ARRAY_SIZE(sp->s_stp)) {
2659 			rc = EINVAL;
2660 			goto fail1;
2661 		}
2662 
2663 		sp->s_tx_scale_base[qtype] = txq;
2664 
2665 		for (evq = 0; evq < tx_scale; evq++) {
2666 			if ((rc = sfxge_tx_qinit(sp, txq, qtype, evq)) != 0) {
2667 				goto fail2;
2668 			}
2669 			txq++;
2670 		}
2671 		ASSERT3U(txq, <=, EFX_ARRAY_SIZE(sp->s_stp));
2672 	}
2673 
2674 	return (0);
2675 
2676 fail2:
2677 	DTRACE_PROBE(fail2);
2678 
2679 fail1:
2680 	DTRACE_PROBE1(fail1, int, rc);
2681 
2682 	index = EFX_ARRAY_SIZE(sp->s_stp);
2683 	while (--index >= 0) {
2684 		if (sp->s_stp[index] != NULL)
2685 			sfxge_tx_qfini(sp, index);
2686 	}
2687 
2688 	kmem_cache_destroy(sp->s_tqc);
2689 	sp->s_tqc = NULL;
2690 
2691 	kmem_cache_destroy(sp->s_tmc);
2692 	sp->s_tmc = NULL;
2693 
2694 	kmem_cache_destroy(sp->s_tbc);
2695 	sp->s_tbc = NULL;
2696 
2697 	kmem_cache_destroy(sp->s_tpc);
2698 	sp->s_tpc = NULL;
2699 
2700 	return (rc);
2701 }
2702 
2703 int
2704 sfxge_tx_start(sfxge_t *sp)
2705 {
2706 	efx_nic_t *enp = sp->s_enp;
2707 	int index;
2708 	int rc;
2709 
2710 	/* Initialize the transmit module */
2711 	if ((rc = efx_tx_init(enp)) != 0)
2712 		goto fail1;
2713 
2714 	for (index = 0; index < EFX_ARRAY_SIZE(sp->s_stp); index++) {
2715 		if (sp->s_stp[index] != NULL)
2716 			if ((rc = sfxge_tx_qstart(sp, index)) != 0)
2717 				goto fail2;
2718 	}
2719 
2720 	return (0);
2721 
2722 fail2:
2723 	DTRACE_PROBE(fail2);
2724 
2725 	sfxge_tx_stop(sp);
2726 
2727 fail1:
2728 	DTRACE_PROBE1(fail1, int, rc);
2729 
2730 	return (rc);
2731 }
2732 
2733 
2734 /*
2735  * Add a packet to the TX Deferred Packet List and if the TX queue lock
2736  * can be acquired then call sfxge_tx_qdpl_service() to fragment and push
2737  * to the H/W transmit descriptor ring
2738  *
2739  * If ENOSPC is returned then the DPL is full or the packet create failed, but
2740  * the mblk isn't freed so that the caller can return this mblk from mc_tx() to
2741  * back-pressure the OS stack.
2742  *
2743  * For all other errors the mblk is freed
2744  */
2745 int
2746 sfxge_tx_packet_add(sfxge_t *sp, mblk_t *mp)
2747 {
2748 	struct ether_header *etherhp;
2749 	struct ip *iphp;
2750 	struct tcphdr *thp;
2751 	size_t off;
2752 	size_t size;
2753 	size_t mss;
2754 	sfxge_txq_t *stp;
2755 	unsigned int txq;
2756 	int index;
2757 	boolean_t locked;
2758 	sfxge_tx_packet_t *stpp;
2759 	sfxge_packet_type_t pkt_type;
2760 	uint16_t sport, dport;
2761 	int rc = 0;
2762 
2763 	ASSERT3P(mp->b_next, ==, NULL);
2764 	ASSERT(!(DB_CKSUMFLAGS(mp) & HCK_PARTIALCKSUM));
2765 
2766 	/*
2767 	 * Do not enqueue packets during startup/shutdown;
2768 	 *
2769 	 * NOTE: This access to the state is NOT protected by the state lock. It
2770 	 * is an imperfect test and anything further getting onto the get/put
2771 	 * deferred packet lists is cleaned up in (possibly repeated) calls to
2772 	 * sfxge_can_destroy().
2773 	 */
2774 	if (sp->s_state != SFXGE_STARTED) {
2775 		rc = EINVAL;
2776 		goto fail1;
2777 	}
2778 
2779 	etherhp = NULL;
2780 	iphp = NULL;
2781 	thp = NULL;
2782 	off = 0;
2783 	size = 0;
2784 	mss = 0;
2785 
2786 	/* Check whether we need the header pointers for LSO segmentation */
2787 	if (DB_LSOFLAGS(mp) & HW_LSO) {
2788 		/* LSO segmentation relies on hardware checksum offload */
2789 		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;
2790 
2791 		if ((mss = DB_LSOMSS(mp)) == 0) {
2792 			rc = EINVAL;
2793 			goto fail1;
2794 		}
2795 
2796 		pkt_type = sfxge_pkthdr_parse(mp, &etherhp, &iphp, &thp,
2797 		    &off, &size, &sport, &dport);
2798 
2799 		if (pkt_type != SFXGE_PACKET_TYPE_IPV4_TCP ||
2800 		    etherhp == NULL ||
2801 		    iphp == NULL ||
2802 		    thp == NULL ||
2803 		    off == 0) {
2804 			rc = EINVAL;
2805 			goto fail2;
2806 		}
2807 	}
2808 
2809 	/* Choose the appropriate transit queue */
2810 	if (DB_CKSUMFLAGS(mp) & HCK_FULLCKSUM) {
2811 		sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2812 
2813 		if (srsp->srs_state == SFXGE_RX_SCALE_STARTED) {
2814 			uint32_t hash;
2815 
2816 			if (srsp->srs_count > 1) {
2817 				/*
2818 				 * If we have not already parsed the headers
2819 				 * for LSO segmentation then we need to do it
2820 				 * now so we can calculate the hash.
2821 				 */
2822 				if (thp == NULL) {
2823 					(void) sfxge_pkthdr_parse(mp, &etherhp,
2824 					    &iphp, &thp, &off, &size,
2825 					    &sport, &dport);
2826 				}
2827 
2828 				if (thp != NULL) {
2829 					SFXGE_TCP_HASH(sp,
2830 					    &iphp->ip_dst.s_addr,
2831 					    thp->th_dport,
2832 					    &iphp->ip_src.s_addr,
2833 					    thp->th_sport, hash);
2834 
2835 					index = srsp->srs_tbl[hash %
2836 					    SFXGE_RX_SCALE_MAX];
2837 				} else if (iphp != NULL) {
2838 					/*
2839 					 * Calculate IPv4 4-tuple hash, with
2840 					 * TCP/UDP/SCTP src/dest ports. Ports
2841 					 * are zero for other IPv4 protocols.
2842 					 */
2843 					SFXGE_IP_HASH(sp,
2844 					    &iphp->ip_dst.s_addr, dport,
2845 					    &iphp->ip_src.s_addr, sport, hash);
2846 
2847 					index = srsp->srs_tbl[hash %
2848 					    SFXGE_RX_SCALE_MAX];
2849 				} else {
2850 					/*
2851 					 * Other traffic always goes to the
2852 					 * the queue in the zero-th entry of
2853 					 * the RSS table.
2854 					 */
2855 					index = srsp->srs_tbl[0];
2856 				}
2857 			} else {
2858 				/*
2859 				 * It does not matter what the hash is
2860 				 * because all the RSS table entries will be
2861 				 * the same.
2862 				 */
2863 				index = srsp->srs_tbl[0];
2864 			}
2865 
2866 			/*
2867 			 * Find the event queue corresponding to the hash in
2868 			 * the RSS table.
2869 			 */
2870 			txq = sp->s_tx_scale_base[SFXGE_TXQ_IP_TCP_UDP_CKSUM] +
2871 			    index;
2872 			stp = sp->s_stp[txq];
2873 			ASSERT3U(stp->st_evq, ==, index);
2874 		} else {
2875 			index = 0;
2876 			txq = sp->s_tx_scale_base[SFXGE_TXQ_IP_TCP_UDP_CKSUM] +
2877 			    index;
2878 			stp = sp->s_stp[txq];
2879 		}
2880 	} else if (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) {
2881 		ASSERT3U(sp->s_tx_scale_max[SFXGE_TXQ_IP_CKSUM], >=, 1);
2882 		index = 0;
2883 		txq = sp->s_tx_scale_base[SFXGE_TXQ_IP_CKSUM] + index;
2884 		stp = sp->s_stp[txq];
2885 	} else {
2886 		/*
2887 		 * No hardware checksum offload requested.
2888 		 */
2889 		sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2890 
2891 		if (srsp->srs_state == SFXGE_RX_SCALE_STARTED) {
2892 			uint32_t hash = 0;
2893 
2894 			if (srsp->srs_count > 1) {
2895 				if (iphp == NULL) {
2896 					(void) sfxge_pkthdr_parse(mp, &etherhp,
2897 					    &iphp, &thp, &off, &size,
2898 					    &sport, &dport);
2899 				}
2900 
2901 				if (iphp != NULL) {
2902 					/*
2903 					 * Calculate IPv4 4-tuple hash, with
2904 					 * TCP/UDP/SCTP src/dest ports. Ports
2905 					 * are zero for other IPv4 protocols.
2906 					 */
2907 					SFXGE_IP_HASH(sp,
2908 					    &iphp->ip_dst.s_addr, dport,
2909 					    &iphp->ip_src.s_addr, sport, hash);
2910 
2911 					hash = hash % SFXGE_RX_SCALE_MAX;
2912 				}
2913 			}
2914 			index = srsp->srs_tbl[hash];
2915 
2916 			/*
2917 			 * The RSS table (indexed by hash) gives the RXQ index,
2918 			 * (mapped 1:1 with EVQs). Find the TXQ that results in
2919 			 * using the same EVQ as for the RX data path.
2920 			 */
2921 			ASSERT3U(sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM],
2922 			    >, index);
2923 			txq = sp->s_tx_scale_base[SFXGE_TXQ_NON_CKSUM] + index;
2924 			stp = sp->s_stp[txq];
2925 			ASSERT3U(stp->st_evq, ==, index);
2926 		} else {
2927 			ASSERT3U(sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM], >, 0);
2928 			index = 0;
2929 			txq = sp->s_tx_scale_base[SFXGE_TXQ_NON_CKSUM] + index;
2930 			stp = sp->s_stp[txq];
2931 		}
2932 
2933 
2934 	}
2935 	ASSERT(stp != NULL);
2936 
2937 	ASSERT(mss == 0 || (DB_LSOFLAGS(mp) & HW_LSO));
2938 
2939 	/* Try to grab the lock */
2940 	locked = mutex_tryenter(&(stp->st_lock));
2941 
2942 	if (locked) {
2943 		/* Try to grab a packet from the pool */
2944 		stpp = sfxge_tx_qfpp_get(stp);
2945 	} else {
2946 		stpp = NULL;
2947 	}
2948 
2949 	if (stpp == NULL) {
2950 		/*
2951 		 * Either the pool was empty or we don't have the lock so
2952 		 * allocate a new packet.
2953 		 */
2954 		if ((stpp = sfxge_tx_packet_create(sp)) == NULL) {
2955 			rc = ENOSPC;
2956 			goto fail3;
2957 		}
2958 	}
2959 
2960 	stpp->stp_mp = mp;
2961 	stpp->stp_etherhp = etherhp;
2962 	stpp->stp_iphp = iphp;
2963 	stpp->stp_thp = thp;
2964 	stpp->stp_off = off;
2965 	stpp->stp_size = size;
2966 	stpp->stp_mss = mss;
2967 	stpp->stp_dpl_put_len = 0;
2968 
2969 	rc = sfxge_tx_qdpl_add(stp, stpp, locked);
2970 	if (rc != 0) {
2971 		/* ENOSPC can happen for DPL get or put list is full */
2972 		ASSERT3U(rc, ==, ENOSPC);
2973 
2974 		/*
2975 		 * Note; if this is the unlocked DPL put list full case there is
2976 		 * no need to worry about a race with locked
2977 		 * sfxge_tx_qdpl_swizzle() as we know that the TX DPL put list
2978 		 * was full and would have been swizzle'd to the TX DPL get
2979 		 * list; hence guaranteeing future TX completions and calls
2980 		 * to mac_tx_update() via sfxge_tx_qcomplete()
2981 		 */
2982 		goto fail4;
2983 	}
2984 
2985 	/* Try to grab the lock again */
2986 	if (!locked)
2987 		locked = mutex_tryenter(&(stp->st_lock));
2988 
2989 	if (locked) {
2990 		/* Try to service the list */
2991 		sfxge_tx_qdpl_service(stp);
2992 		/* lock has been dropped */
2993 	}
2994 
2995 	return (0);
2996 
2997 fail4:
2998 	DTRACE_PROBE(fail4);
2999 	sfxge_tx_packet_destroy(sp, stpp);
3000 fail3:
3001 	DTRACE_PROBE(fail3);
3002 	if (locked)
3003 		mutex_exit(&(stp->st_lock));
3004 fail2:
3005 	DTRACE_PROBE(fail2);
3006 fail1:
3007 	DTRACE_PROBE1(fail1, int, rc);
3008 
3009 	if (rc != ENOSPC)
3010 		freemsg(mp);
3011 	return (rc);
3012 }
3013 
3014 void
3015 sfxge_tx_stop(sfxge_t *sp)
3016 {
3017 	efx_nic_t *enp = sp->s_enp;
3018 	clock_t timeout;
3019 	boolean_t wait_for_flush;
3020 	int index;
3021 
3022 	ASSERT(mutex_owned(&(sp->s_state_lock)));
3023 
3024 	mutex_enter(&(sp->s_tx_flush_lock));
3025 
3026 	/* Flush all the queues */
3027 	if (sp->s_hw_err == SFXGE_HW_OK) {
3028 		wait_for_flush = B_TRUE;
3029 	} else {
3030 		/*
3031 		 * Flag indicates possible hardware failure.
3032 		 * Attempt flush but do not wait for it to complete.
3033 		 */
3034 		wait_for_flush = B_FALSE;
3035 	}
3036 
3037 	/* Prepare queues to stop and flush the hardware ring */
3038 	index = EFX_ARRAY_SIZE(sp->s_stp);
3039 	while (--index >= 0) {
3040 		if (sp->s_stp[index] != NULL)
3041 			sfxge_tx_qflush(sp, index, wait_for_flush);
3042 	}
3043 
3044 	if (wait_for_flush == B_FALSE)
3045 		goto flush_done;
3046 
3047 	/* Wait upto 2sec for queue flushing to complete */
3048 	timeout = ddi_get_lbolt() + drv_usectohz(SFXGE_TX_QFLUSH_USEC);
3049 
3050 	while (sp->s_tx_flush_pending > 0) {
3051 		if (cv_timedwait(&(sp->s_tx_flush_kv), &(sp->s_tx_flush_lock),
3052 		    timeout) < 0) {
3053 			/* Timeout waiting for queues to flush */
3054 			dev_info_t *dip = sp->s_dip;
3055 
3056 			DTRACE_PROBE(timeout);
3057 			dev_err(dip, CE_NOTE,
3058 			    SFXGE_CMN_ERR "tx qflush timeout");
3059 			break;
3060 		}
3061 	}
3062 
3063 flush_done:
3064 	sp->s_tx_flush_pending = 0;
3065 	mutex_exit(&(sp->s_tx_flush_lock));
3066 
3067 	/* Stop all the queues */
3068 	index = EFX_ARRAY_SIZE(sp->s_stp);
3069 	while (--index >= 0) {
3070 		if (sp->s_stp[index] != NULL)
3071 			sfxge_tx_qstop(sp, index);
3072 	}
3073 
3074 	/* Tear down the transmit module */
3075 	efx_tx_fini(enp);
3076 }
3077 
3078 void
3079 sfxge_tx_fini(sfxge_t *sp)
3080 {
3081 	int index;
3082 
3083 	index = EFX_ARRAY_SIZE(sp->s_stp);
3084 	while (--index >= 0) {
3085 		if (sp->s_stp[index] != NULL)
3086 			sfxge_tx_qfini(sp, index);
3087 	}
3088 
3089 	kmem_cache_destroy(sp->s_tqc);
3090 	sp->s_tqc = NULL;
3091 
3092 	kmem_cache_destroy(sp->s_tmc);
3093 	sp->s_tmc = NULL;
3094 
3095 	kmem_cache_destroy(sp->s_tbc);
3096 	sp->s_tbc = NULL;
3097 
3098 	kmem_cache_destroy(sp->s_tpc);
3099 	sp->s_tpc = NULL;
3100 }
3101