xref: /titanic_51/usr/src/uts/common/io/nxge/nxge_send.c (revision ddece0baf7ff3a228bcd106c2bb2303ac0c9af89)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/nxge/nxge_impl.h>
29 
30 extern uint32_t		nxge_reclaim_pending;
31 extern uint32_t 	nxge_bcopy_thresh;
32 extern uint32_t 	nxge_dvma_thresh;
33 extern uint32_t 	nxge_dma_stream_thresh;
34 extern uint32_t		nxge_tx_minfree;
35 extern uint32_t		nxge_tx_intr_thres;
36 extern uint32_t		nxge_tx_max_gathers;
37 extern uint32_t		nxge_tx_tiny_pack;
38 extern uint32_t		nxge_tx_use_bcopy;
39 extern uint32_t		nxge_tx_lb_policy;
40 extern uint32_t		nxge_no_tx_lb;
41 
/*
 * Transmit hint handed to the ring selection code (nxge_tx_lb_ring_1()).
 * nxge_m_tx() currently passes a hint with every field cleared.
 */
typedef struct _mac_tx_hint {
	uint16_t	sap;	/* cleared by nxge_m_tx(); not read in this file */
	uint16_t	vid;	/* nonzero disables TCP/UDP based ring hashing */
	void		*hash;	/* if non-NULL, NXGE_TX_LB_HASH uses it mod ring count */
} mac_tx_hint_t, *p_mac_tx_hint_t;
47 
48 int nxge_tx_lb_ring_1(p_mblk_t, uint32_t, p_mac_tx_hint_t);
49 
50 int
51 nxge_start(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, p_mblk_t mp)
52 {
53 	int 			status = 0;
54 	p_tx_desc_t 		tx_desc_ring_vp;
55 	npi_handle_t		npi_desc_handle;
56 	nxge_os_dma_handle_t 	tx_desc_dma_handle;
57 	p_tx_desc_t 		tx_desc_p;
58 	p_tx_msg_t 		tx_msg_ring;
59 	p_tx_msg_t 		tx_msg_p;
60 	tx_desc_t		tx_desc, *tmp_desc_p;
61 	tx_desc_t		sop_tx_desc, *sop_tx_desc_p;
62 	p_tx_pkt_header_t	hdrp;
63 	p_tx_pkt_hdr_all_t	pkthdrp;
64 	uint8_t			npads = 0;
65 	uint64_t 		dma_ioaddr;
66 	uint32_t		dma_flags;
67 	int			last_bidx;
68 	uint8_t 		*b_rptr;
69 	caddr_t 		kaddr;
70 	uint32_t		nmblks;
71 	uint32_t		ngathers;
72 	uint32_t		clen;
73 	int 			len;
74 	uint32_t		pkt_len, pack_len, min_len;
75 	uint32_t		bcopy_thresh;
76 	int 			i, cur_index, sop_index;
77 	uint16_t		tail_index;
78 	boolean_t		tail_wrap = B_FALSE;
79 	nxge_dma_common_t	desc_area;
80 	nxge_os_dma_handle_t 	dma_handle;
81 	ddi_dma_cookie_t 	dma_cookie;
82 	npi_handle_t		npi_handle;
83 	p_mblk_t 		nmp;
84 	p_mblk_t		t_mp;
85 	uint32_t 		ncookies;
86 	boolean_t 		good_packet;
87 	boolean_t 		mark_mode = B_FALSE;
88 	p_nxge_stats_t 		statsp;
89 	p_nxge_tx_ring_stats_t tdc_stats;
90 	t_uscalar_t 		start_offset = 0;
91 	t_uscalar_t 		stuff_offset = 0;
92 	t_uscalar_t 		end_offset = 0;
93 	t_uscalar_t 		value = 0;
94 	t_uscalar_t 		cksum_flags = 0;
95 	boolean_t		cksum_on = B_FALSE;
96 	uint32_t		boff = 0;
97 	uint64_t		tot_xfer_len = 0, tmp_len = 0;
98 	boolean_t		header_set = B_FALSE;
99 #ifdef NXGE_DEBUG
100 	p_tx_desc_t 		tx_desc_ring_pp;
101 	p_tx_desc_t 		tx_desc_pp;
102 	tx_desc_t		*save_desc_p;
103 	int			dump_len;
104 	int			sad_len;
105 	uint64_t		sad;
106 	int			xfer_len;
107 	uint32_t		msgsize;
108 #endif
109 
110 	NXGE_DEBUG_MSG((nxgep, TX_CTL,
111 		"==> nxge_start: tx dma channel %d", tx_ring_p->tdc));
112 	NXGE_DEBUG_MSG((nxgep, TX_CTL,
113 		"==> nxge_start: Starting tdc %d desc pending %d",
114 		tx_ring_p->tdc, tx_ring_p->descs_pending));
115 
116 	statsp = nxgep->statsp;
117 
118 	if (nxgep->statsp->port_stats.lb_mode == nxge_lb_normal) {
119 		if (!statsp->mac_stats.link_up) {
120 			freemsg(mp);
121 			NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start: "
122 				"link not up or LB mode"));
123 			goto nxge_start_fail1;
124 		}
125 	}
126 
127 	hcksum_retrieve(mp, NULL, NULL, &start_offset,
128 		&stuff_offset, &end_offset, &value, &cksum_flags);
129 	if (!NXGE_IS_VLAN_PACKET(mp->b_rptr)) {
130 		start_offset += sizeof (ether_header_t);
131 		stuff_offset += sizeof (ether_header_t);
132 	} else {
133 		start_offset += sizeof (struct ether_vlan_header);
134 		stuff_offset += sizeof (struct ether_vlan_header);
135 	}
136 
137 	if (cksum_flags & HCK_PARTIALCKSUM) {
138 		NXGE_DEBUG_MSG((nxgep, TX_CTL,
139 			"==> nxge_start: cksum_flags 0x%x (partial checksum) ",
140 			cksum_flags));
141 		cksum_on = B_TRUE;
142 	}
143 
144 #ifdef	NXGE_DEBUG
145 	if (tx_ring_p->descs_pending) {
146 		NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start: "
147 			"desc pending %d ", tx_ring_p->descs_pending));
148 	}
149 
150 	dump_len = (int)(MBLKL(mp));
151 	dump_len = (dump_len > 128) ? 128: dump_len;
152 
153 	NXGE_DEBUG_MSG((nxgep, TX_CTL,
154 		"==> nxge_start: tdc %d: dumping ...: b_rptr $%p "
155 		"(Before header reserve: ORIGINAL LEN %d)",
156 		tx_ring_p->tdc,
157 		mp->b_rptr,
158 		dump_len));
159 
160 	NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start: dump packets "
161 		"(IP ORIGINAL b_rptr $%p): %s", mp->b_rptr,
162 		nxge_dump_packet((char *)mp->b_rptr, dump_len)));
163 #endif
164 
165 	MUTEX_ENTER(&tx_ring_p->lock);
166 	tdc_stats = tx_ring_p->tdc_stats;
167 	mark_mode = (tx_ring_p->descs_pending &&
168 		((tx_ring_p->tx_ring_size - tx_ring_p->descs_pending)
169 		< nxge_tx_minfree));
170 
171 	NXGE_DEBUG_MSG((nxgep, TX_CTL,
172 		"TX Descriptor ring is channel %d mark mode %d",
173 		tx_ring_p->tdc, mark_mode));
174 
175 	if (!nxge_txdma_reclaim(nxgep, tx_ring_p, nxge_tx_minfree)) {
176 		NXGE_DEBUG_MSG((nxgep, TX_CTL,
177 			"TX Descriptor ring is full: channel %d",
178 			tx_ring_p->tdc));
179 		cas32((uint32_t *)&tx_ring_p->queueing, 0, 1);
180 		tdc_stats->tx_no_desc++;
181 		MUTEX_EXIT(&tx_ring_p->lock);
182 		if (nxgep->resched_needed && !nxgep->resched_running) {
183 			nxgep->resched_running = B_TRUE;
184 			ddi_trigger_softintr(nxgep->resched_id);
185 		}
186 		status = 1;
187 		goto nxge_start_fail1;
188 	}
189 
190 	nmp = mp;
191 	i = sop_index = tx_ring_p->wr_index;
192 	nmblks = 0;
193 	ngathers = 0;
194 	pkt_len = 0;
195 	pack_len = 0;
196 	clen = 0;
197 	last_bidx = -1;
198 	good_packet = B_TRUE;
199 
200 	desc_area = tx_ring_p->tdc_desc;
201 	npi_handle = desc_area.npi_handle;
202 	npi_desc_handle.regh = (nxge_os_acc_handle_t)
203 			DMA_COMMON_ACC_HANDLE(desc_area);
204 	tx_desc_ring_vp = (p_tx_desc_t)DMA_COMMON_VPTR(desc_area);
205 #ifdef	NXGE_DEBUG
206 	tx_desc_ring_pp = (p_tx_desc_t)DMA_COMMON_IOADDR(desc_area);
207 #endif
208 	tx_desc_dma_handle = (nxge_os_dma_handle_t)
209 			DMA_COMMON_HANDLE(desc_area);
210 	tx_msg_ring = tx_ring_p->tx_msg_ring;
211 
212 	NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start: wr_index %d i %d",
213 		sop_index, i));
214 
215 #ifdef	NXGE_DEBUG
216 	msgsize = msgdsize(nmp);
217 	NXGE_DEBUG_MSG((nxgep, TX_CTL,
218 		"==> nxge_start(1): wr_index %d i %d msgdsize %d",
219 		sop_index, i, msgsize));
220 #endif
221 	/*
222 	 * The first 16 bytes of the premapped buffer are reserved
223 	 * for header. No padding will be used.
224 	 */
225 	pkt_len = pack_len = boff = TX_PKT_HEADER_SIZE;
226 	if (nxge_tx_use_bcopy) {
227 		bcopy_thresh = (nxge_bcopy_thresh - TX_PKT_HEADER_SIZE);
228 	} else {
229 		bcopy_thresh = (TX_BCOPY_SIZE - TX_PKT_HEADER_SIZE);
230 	}
231 	while (nmp) {
232 		good_packet = B_TRUE;
233 		b_rptr = nmp->b_rptr;
234 		len = MBLKL(nmp);
235 		if (len <= 0) {
236 			nmp = nmp->b_cont;
237 			continue;
238 		}
239 		nmblks++;
240 
241 		NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start(1): nmblks %d "
242 			"len %d pkt_len %d pack_len %d",
243 			nmblks, len, pkt_len, pack_len));
244 		/*
245 		 * Hardware limits the transfer length to 4K for NIU and
246 		 * 4076 (TX_MAX_TRANSFER_LENGTH) for Neptune. But we just
247 		 * use TX_MAX_TRANSFER_LENGTH as the limit for both.
248 		 * If len is longer than the limit, then we break nmp into
249 		 * two chunks: Make the first chunk equal to the limit and
250 		 * the second chunk for the remaining data. If the second
251 		 * chunk is still larger than the limit, then it will be
252 		 * broken into two in the next pass.
253 		 */
254 		if (len > TX_MAX_TRANSFER_LENGTH - TX_PKT_HEADER_SIZE) {
255 			t_mp = dupb(nmp);
256 			nmp->b_wptr = nmp->b_rptr +
257 				(TX_MAX_TRANSFER_LENGTH - TX_PKT_HEADER_SIZE);
258 			t_mp->b_rptr = nmp->b_wptr;
259 			t_mp->b_cont = nmp->b_cont;
260 			nmp->b_cont = t_mp;
261 			len = MBLKL(nmp);
262 		}
263 
264 		tx_desc.value = 0;
265 		tx_desc_p = &tx_desc_ring_vp[i];
266 #ifdef	NXGE_DEBUG
267 		tx_desc_pp = &tx_desc_ring_pp[i];
268 #endif
269 		tx_msg_p = &tx_msg_ring[i];
270 		npi_desc_handle.regp = (uint64_t)tx_desc_p;
271 		if (!header_set &&
272 			((!nxge_tx_use_bcopy && (len > TX_BCOPY_SIZE)) ||
273 				(len >= bcopy_thresh))) {
274 			header_set = B_TRUE;
275 			bcopy_thresh += TX_PKT_HEADER_SIZE;
276 			boff = 0;
277 			pack_len = 0;
278 			kaddr = (caddr_t)DMA_COMMON_VPTR(tx_msg_p->buf_dma);
279 			hdrp = (p_tx_pkt_header_t)kaddr;
280 			clen = pkt_len;
281 			dma_handle = tx_msg_p->buf_dma_handle;
282 			dma_ioaddr = DMA_COMMON_IOADDR(tx_msg_p->buf_dma);
283 			(void) ddi_dma_sync(dma_handle,
284 				i * nxge_bcopy_thresh, nxge_bcopy_thresh,
285 				DDI_DMA_SYNC_FORDEV);
286 
287 			tx_msg_p->flags.dma_type = USE_BCOPY;
288 			goto nxge_start_control_header_only;
289 		}
290 
291 		pkt_len += len;
292 		pack_len += len;
293 
294 		NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start(3): "
295 			"desc entry %d "
296 			"DESC IOADDR $%p "
297 			"desc_vp $%p tx_desc_p $%p "
298 			"desc_pp $%p tx_desc_pp $%p "
299 			"len %d pkt_len %d pack_len %d",
300 			i,
301 			DMA_COMMON_IOADDR(desc_area),
302 			tx_desc_ring_vp, tx_desc_p,
303 			tx_desc_ring_pp, tx_desc_pp,
304 			len, pkt_len, pack_len));
305 
306 		if (len < bcopy_thresh) {
307 			NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start(4): "
308 				"USE BCOPY: "));
309 			if (nxge_tx_tiny_pack) {
310 				uint32_t blst =
311 					TXDMA_DESC_NEXT_INDEX(i, -1,
312 						tx_ring_p->tx_wrap_mask);
313 				NXGE_DEBUG_MSG((nxgep, TX_CTL,
314 					"==> nxge_start(5): pack"));
315 				if ((pack_len <= bcopy_thresh) &&
316 					(last_bidx == blst)) {
317 					NXGE_DEBUG_MSG((nxgep, TX_CTL,
318 						"==> nxge_start: pack(6) "
319 						"(pkt_len %d pack_len %d)",
320 						pkt_len, pack_len));
321 					i = blst;
322 					tx_desc_p = &tx_desc_ring_vp[i];
323 #ifdef	NXGE_DEBUG
324 					tx_desc_pp = &tx_desc_ring_pp[i];
325 #endif
326 					tx_msg_p = &tx_msg_ring[i];
327 					boff = pack_len - len;
328 					ngathers--;
329 				} else if (pack_len > bcopy_thresh &&
330 					header_set) {
331 					pack_len = len;
332 					boff = 0;
333 					bcopy_thresh = nxge_bcopy_thresh;
334 					NXGE_DEBUG_MSG((nxgep, TX_CTL,
335 						"==> nxge_start(7): > max NEW "
336 						"bcopy thresh %d "
337 						"pkt_len %d pack_len %d(next)",
338 						bcopy_thresh,
339 						pkt_len, pack_len));
340 				}
341 				last_bidx = i;
342 			}
343 			kaddr = (caddr_t)DMA_COMMON_VPTR(tx_msg_p->buf_dma);
344 			if ((boff == TX_PKT_HEADER_SIZE) && (nmblks == 1)) {
345 				hdrp = (p_tx_pkt_header_t)kaddr;
346 				header_set = B_TRUE;
347 				NXGE_DEBUG_MSG((nxgep, TX_CTL,
348 					"==> nxge_start(7_x2): "
349 					"pkt_len %d pack_len %d (new hdrp $%p)",
350 					pkt_len, pack_len, hdrp));
351 			}
352 			tx_msg_p->flags.dma_type = USE_BCOPY;
353 			kaddr += boff;
354 			NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start(8): "
355 				"USE BCOPY: before bcopy "
356 				"DESC IOADDR $%p entry %d "
357 				"bcopy packets %d "
358 				"bcopy kaddr $%p "
359 				"bcopy ioaddr (SAD) $%p "
360 				"bcopy clen %d "
361 				"bcopy boff %d",
362 				DMA_COMMON_IOADDR(desc_area), i,
363 				tdc_stats->tx_hdr_pkts,
364 				kaddr,
365 				dma_ioaddr,
366 				clen,
367 				boff));
368 			NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start: "
369 				"1USE BCOPY: "));
370 			NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start: "
371 				"2USE BCOPY: "));
372 			NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start: "
373 				"last USE BCOPY: copy from b_rptr $%p "
374 				"to KADDR $%p (len %d offset %d",
375 				b_rptr, kaddr, len, boff));
376 
377 			bcopy(b_rptr, kaddr, len);
378 
379 #ifdef	NXGE_DEBUG
380 			dump_len = (len > 128) ? 128: len;
381 			NXGE_DEBUG_MSG((nxgep, TX_CTL,
382 				"==> nxge_start: dump packets "
383 				"(After BCOPY len %d)"
384 				"(b_rptr $%p): %s", len, nmp->b_rptr,
385 				nxge_dump_packet((char *)nmp->b_rptr,
386 				dump_len)));
387 #endif
388 
389 			dma_handle = tx_msg_p->buf_dma_handle;
390 			dma_ioaddr = DMA_COMMON_IOADDR(tx_msg_p->buf_dma);
391 			(void) ddi_dma_sync(dma_handle,
392 				i * nxge_bcopy_thresh, nxge_bcopy_thresh,
393 					DDI_DMA_SYNC_FORDEV);
394 			clen = len + boff;
395 			tdc_stats->tx_hdr_pkts++;
396 			NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start(9): "
397 				"USE BCOPY: "
398 				"DESC IOADDR $%p entry %d "
399 				"bcopy packets %d "
400 				"bcopy kaddr $%p "
401 				"bcopy ioaddr (SAD) $%p "
402 				"bcopy clen %d "
403 				"bcopy boff %d",
404 				DMA_COMMON_IOADDR(desc_area),
405 				i,
406 				tdc_stats->tx_hdr_pkts,
407 				kaddr,
408 				dma_ioaddr,
409 				clen,
410 				boff));
411 		} else {
412 			NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start(12): "
413 				"USE DVMA: len %d", len));
414 			tx_msg_p->flags.dma_type = USE_DMA;
415 			dma_flags = DDI_DMA_WRITE;
416 			if (len < nxge_dma_stream_thresh) {
417 				dma_flags |= DDI_DMA_CONSISTENT;
418 			} else {
419 				dma_flags |= DDI_DMA_STREAMING;
420 			}
421 
422 			dma_handle = tx_msg_p->dma_handle;
423 			status = ddi_dma_addr_bind_handle(dma_handle, NULL,
424 				(caddr_t)b_rptr, len, dma_flags,
425 				DDI_DMA_DONTWAIT, NULL,
426 				&dma_cookie, &ncookies);
427 			if (status == DDI_DMA_MAPPED) {
428 				dma_ioaddr = dma_cookie.dmac_laddress;
429 				len = (int)dma_cookie.dmac_size;
430 				clen = (uint32_t)dma_cookie.dmac_size;
431 				NXGE_DEBUG_MSG((nxgep, TX_CTL,
432 					"==> nxge_start(12_1): "
433 					"USE DVMA: len %d clen %d "
434 					"ngathers %d",
435 					len, clen,
436 					ngathers));
437 
438 				npi_desc_handle.regp = (uint64_t)tx_desc_p;
439 				while (ncookies > 1) {
440 					ngathers++;
441 					/*
442 					 * this is the fix for multiple
443 					 * cookies, which are basicaly
444 					 * a descriptor entry, we don't set
445 					 * SOP bit as well as related fields
446 					 */
447 
448 					(void) npi_txdma_desc_gather_set(
449 						npi_desc_handle,
450 						&tx_desc,
451 						(ngathers -1),
452 						mark_mode,
453 						ngathers,
454 						dma_ioaddr,
455 						clen);
456 
457 					tx_msg_p->tx_msg_size = clen;
458 					NXGE_DEBUG_MSG((nxgep, TX_CTL,
459 						"==> nxge_start:  DMA "
460 						"ncookie %d "
461 						"ngathers %d "
462 						"dma_ioaddr $%p len %d"
463 						"desc $%p descp $%p (%d)",
464 						ncookies,
465 						ngathers,
466 						dma_ioaddr, clen,
467 						*tx_desc_p, tx_desc_p, i));
468 
469 					ddi_dma_nextcookie(dma_handle,
470 							&dma_cookie);
471 					dma_ioaddr =
472 						dma_cookie.dmac_laddress;
473 
474 					len = (int)dma_cookie.dmac_size;
475 					clen = (uint32_t)dma_cookie.dmac_size;
476 					NXGE_DEBUG_MSG((nxgep, TX_CTL,
477 						"==> nxge_start(12_2): "
478 						"USE DVMA: len %d clen %d ",
479 						len, clen));
480 
481 					i = TXDMA_DESC_NEXT_INDEX(i, 1,
482 						tx_ring_p->tx_wrap_mask);
483 					tx_desc_p = &tx_desc_ring_vp[i];
484 
485 					npi_desc_handle.regp =
486 						(uint64_t)tx_desc_p;
487 					tx_msg_p = &tx_msg_ring[i];
488 					tx_msg_p->flags.dma_type = USE_NONE;
489 					tx_desc.value = 0;
490 
491 					ncookies--;
492 				}
493 				tdc_stats->tx_ddi_pkts++;
494 				NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start:"
495 					"DMA: ddi packets %d",
496 					tdc_stats->tx_ddi_pkts));
497 			} else {
498 				NXGE_ERROR_MSG((nxgep, NXGE_ERR_CTL,
499 				    "dma mapping failed for %d "
500 				    "bytes addr $%p flags %x (%d)",
501 				    len, b_rptr, status, status));
502 				good_packet = B_FALSE;
503 				tdc_stats->tx_dma_bind_fail++;
504 				tx_msg_p->flags.dma_type = USE_NONE;
505 				goto nxge_start_fail2;
506 			}
507 		} /* ddi dvma */
508 
509 		nmp = nmp->b_cont;
510 nxge_start_control_header_only:
511 		npi_desc_handle.regp = (uint64_t)tx_desc_p;
512 		ngathers++;
513 
514 		if (ngathers == 1) {
515 #ifdef	NXGE_DEBUG
516 			save_desc_p = &sop_tx_desc;
517 #endif
518 			sop_tx_desc_p = &sop_tx_desc;
519 			sop_tx_desc_p->value = 0;
520 			sop_tx_desc_p->bits.hdw.tr_len = clen;
521 			sop_tx_desc_p->bits.hdw.sad = dma_ioaddr >> 32;
522 			sop_tx_desc_p->bits.ldw.sad = dma_ioaddr & 0xffffffff;
523 		} else {
524 #ifdef	NXGE_DEBUG
525 			save_desc_p = &tx_desc;
526 #endif
527 			tmp_desc_p = &tx_desc;
528 			tmp_desc_p->value = 0;
529 			tmp_desc_p->bits.hdw.tr_len = clen;
530 			tmp_desc_p->bits.hdw.sad = dma_ioaddr >> 32;
531 			tmp_desc_p->bits.ldw.sad = dma_ioaddr & 0xffffffff;
532 
533 			tx_desc_p->value = tmp_desc_p->value;
534 		}
535 
536 		NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start(13): "
537 			"Desc_entry %d ngathers %d "
538 			"desc_vp $%p tx_desc_p $%p "
539 			"len %d clen %d pkt_len %d pack_len %d nmblks %d "
540 			"dma_ioaddr (SAD) $%p mark %d",
541 			i, ngathers,
542 			tx_desc_ring_vp, tx_desc_p,
543 			len, clen, pkt_len, pack_len, nmblks,
544 			dma_ioaddr, mark_mode));
545 
546 #ifdef NXGE_DEBUG
547 		npi_desc_handle.nxgep = nxgep;
548 		npi_desc_handle.function.function = nxgep->function_num;
549 		npi_desc_handle.function.instance = nxgep->instance;
550 		sad = (save_desc_p->value & TX_PKT_DESC_SAD_MASK);
551 		xfer_len = ((save_desc_p->value & TX_PKT_DESC_TR_LEN_MASK) >>
552 			TX_PKT_DESC_TR_LEN_SHIFT);
553 
554 
555 		NXGE_DEBUG_MSG((nxgep, TX_CTL, "\n\t: value 0x%llx\n"
556 			"\t\tsad $%p\ttr_len %d len %d\tnptrs %d\t"
557 			"mark %d sop %d\n",
558 			save_desc_p->value,
559 			sad,
560 			save_desc_p->bits.hdw.tr_len,
561 			xfer_len,
562 			save_desc_p->bits.hdw.num_ptr,
563 			save_desc_p->bits.hdw.mark,
564 			save_desc_p->bits.hdw.sop));
565 
566 		npi_txdma_dump_desc_one(npi_desc_handle, NULL, i);
567 #endif
568 
569 		tx_msg_p->tx_msg_size = clen;
570 		i = TXDMA_DESC_NEXT_INDEX(i, 1, tx_ring_p->tx_wrap_mask);
571 		if (ngathers > nxge_tx_max_gathers) {
572 			good_packet = B_FALSE;
573 			hcksum_retrieve(mp, NULL, NULL, &start_offset,
574 				&stuff_offset, &end_offset, &value,
575 				&cksum_flags);
576 
577 			NXGE_DEBUG_MSG((NULL, TX_CTL,
578 				"==> nxge_start(14): pull msg - "
579 				"len %d pkt_len %d ngathers %d",
580 				len, pkt_len, ngathers));
581 			/* Pull all message blocks from b_cont */
582 			if ((msgpullup(mp, -1)) == NULL) {
583 				goto nxge_start_fail2;
584 			}
585 			goto nxge_start_fail2;
586 		}
587 	} /* while (nmp) */
588 
589 	tx_msg_p->tx_message = mp;
590 	tx_desc_p = &tx_desc_ring_vp[sop_index];
591 	npi_desc_handle.regp = (uint64_t)tx_desc_p;
592 
593 	pkthdrp = (p_tx_pkt_hdr_all_t)hdrp;
594 	pkthdrp->reserved = 0;
595 	hdrp->value = 0;
596 	(void) nxge_fill_tx_hdr(mp, B_FALSE, cksum_on,
597 		(pkt_len - TX_PKT_HEADER_SIZE), npads, pkthdrp);
598 
599 	if (pkt_len > NXGE_MTU_DEFAULT_MAX) {
600 		tdc_stats->tx_jumbo_pkts++;
601 	}
602 
603 	min_len = (nxgep->msg_min + TX_PKT_HEADER_SIZE + (npads * 2));
604 	if (pkt_len < min_len) {
605 		/* Assume we use bcopy to premapped buffers */
606 		kaddr = (caddr_t)DMA_COMMON_VPTR(tx_msg_p->buf_dma);
607 		NXGE_DEBUG_MSG((NULL, TX_CTL,
608 			"==> nxge_start(14-1): < (msg_min + 16)"
609 			"len %d pkt_len %d min_len %d bzero %d ngathers %d",
610 			len, pkt_len, min_len, (min_len - pkt_len), ngathers));
611 		bzero((kaddr + pkt_len), (min_len - pkt_len));
612 		pkt_len = tx_msg_p->tx_msg_size = min_len;
613 
614 		sop_tx_desc_p->bits.hdw.tr_len = min_len;
615 
616 		NXGE_MEM_PIO_WRITE64(npi_desc_handle, sop_tx_desc_p->value);
617 		tx_desc_p->value = sop_tx_desc_p->value;
618 
619 		NXGE_DEBUG_MSG((NULL, TX_CTL,
620 			"==> nxge_start(14-2): < msg_min - "
621 			"len %d pkt_len %d min_len %d ngathers %d",
622 			len, pkt_len, min_len, ngathers));
623 	}
624 
625 	NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start: cksum_flags 0x%x ",
626 		cksum_flags));
627 	if (cksum_flags & HCK_PARTIALCKSUM) {
628 		NXGE_DEBUG_MSG((nxgep, TX_CTL,
629 			"==> nxge_start: cksum_flags 0x%x (partial checksum) ",
630 			cksum_flags));
631 		cksum_on = B_TRUE;
632 		NXGE_DEBUG_MSG((nxgep, TX_CTL,
633 			"==> nxge_start: from IP cksum_flags 0x%x "
634 			"(partial checksum) "
635 			"start_offset %d stuff_offset %d",
636 			cksum_flags, start_offset, stuff_offset));
637 		tmp_len = (uint64_t)(start_offset >> 1);
638 		hdrp->value |= (tmp_len << TX_PKT_HEADER_L4START_SHIFT);
639 		tmp_len = (uint64_t)(stuff_offset >> 1);
640 		hdrp->value |= (tmp_len << TX_PKT_HEADER_L4STUFF_SHIFT);
641 
642 		NXGE_DEBUG_MSG((nxgep, TX_CTL,
643 			"==> nxge_start: from IP cksum_flags 0x%x "
644 			"(partial checksum) "
645 			"after SHIFT start_offset %d stuff_offset %d",
646 			cksum_flags, start_offset, stuff_offset));
647 	}
648 	{
649 		uint64_t	tmp_len;
650 
651 		/* pkt_len already includes 16 + paddings!! */
652 		/* Update the control header length */
653 		tot_xfer_len = (pkt_len - TX_PKT_HEADER_SIZE);
654 		tmp_len = hdrp->value |
655 			(tot_xfer_len << TX_PKT_HEADER_TOT_XFER_LEN_SHIFT);
656 
657 		NXGE_DEBUG_MSG((nxgep, TX_CTL,
658 			"==> nxge_start(15_x1): setting SOP "
659 			"tot_xfer_len 0x%llx (%d) pkt_len %d tmp_len "
660 			"0x%llx hdrp->value 0x%llx",
661 			tot_xfer_len, tot_xfer_len, pkt_len,
662 			tmp_len, hdrp->value));
663 #if defined(_BIG_ENDIAN)
664 		hdrp->value = ddi_swap64(tmp_len);
665 #else
666 		hdrp->value = tmp_len;
667 #endif
668 		NXGE_DEBUG_MSG((nxgep,
669 			TX_CTL, "==> nxge_start(15_x2): setting SOP "
670 			"after SWAP: tot_xfer_len 0x%llx pkt_len %d "
671 			"tmp_len 0x%llx hdrp->value 0x%llx",
672 			tot_xfer_len, pkt_len,
673 			tmp_len, hdrp->value));
674 	}
675 
676 	NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start(15): setting SOP "
677 		"wr_index %d "
678 		"tot_xfer_len (%d) pkt_len %d npads %d",
679 		sop_index,
680 		tot_xfer_len, pkt_len,
681 		npads));
682 
683 	sop_tx_desc_p->bits.hdw.sop = 1;
684 	sop_tx_desc_p->bits.hdw.mark = mark_mode;
685 	sop_tx_desc_p->bits.hdw.num_ptr = ngathers;
686 
687 	NXGE_MEM_PIO_WRITE64(npi_desc_handle, sop_tx_desc_p->value);
688 
689 	NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start(16): set SOP done"));
690 
691 #ifdef NXGE_DEBUG
692 	npi_desc_handle.nxgep = nxgep;
693 	npi_desc_handle.function.function = nxgep->function_num;
694 	npi_desc_handle.function.instance = nxgep->instance;
695 
696 	NXGE_DEBUG_MSG((nxgep, TX_CTL, "\n\t: value 0x%llx\n"
697 		"\t\tsad $%p\ttr_len %d len %d\tnptrs %d\tmark %d sop %d\n",
698 		save_desc_p->value,
699 		sad,
700 		save_desc_p->bits.hdw.tr_len,
701 		xfer_len,
702 		save_desc_p->bits.hdw.num_ptr,
703 		save_desc_p->bits.hdw.mark,
704 		save_desc_p->bits.hdw.sop));
705 	(void) npi_txdma_dump_desc_one(npi_desc_handle, NULL, sop_index);
706 
707 	dump_len = (pkt_len > 128) ? 128: pkt_len;
708 	NXGE_DEBUG_MSG((nxgep, TX_CTL,
709 		"==> nxge_start: dump packets(17) (after sop set, len "
710 		" (len/dump_len/pkt_len/tot_xfer_len) %d/%d/%d/%d):\n"
711 		"ptr $%p: %s", len, dump_len, pkt_len, tot_xfer_len,
712 		(char *)hdrp,
713 		nxge_dump_packet((char *)hdrp, dump_len)));
714 	NXGE_DEBUG_MSG((nxgep, TX_CTL,
715 		"==> nxge_start(18): TX desc sync: sop_index %d",
716 			sop_index));
717 #endif
718 
719 	if ((ngathers == 1) || tx_ring_p->wr_index < i) {
720 		(void) ddi_dma_sync(tx_desc_dma_handle,
721 			sop_index * sizeof (tx_desc_t),
722 			ngathers * sizeof (tx_desc_t),
723 			DDI_DMA_SYNC_FORDEV);
724 
725 		NXGE_DEBUG_MSG((nxgep, TX_CTL, "nxge_start(19): sync 1 "
726 			"cs_off = 0x%02X cs_s_off = 0x%02X "
727 			"pkt_len %d ngathers %d sop_index %d\n",
728 			stuff_offset, start_offset,
729 			pkt_len, ngathers, sop_index));
730 	} else { /* more than one descriptor and wrap around */
731 		uint32_t nsdescs = tx_ring_p->tx_ring_size - sop_index;
732 		(void) ddi_dma_sync(tx_desc_dma_handle,
733 			sop_index * sizeof (tx_desc_t),
734 			nsdescs * sizeof (tx_desc_t),
735 			DDI_DMA_SYNC_FORDEV);
736 		NXGE_DEBUG_MSG((nxgep, TX_CTL, "nxge_start(20): sync 1 "
737 			"cs_off = 0x%02X cs_s_off = 0x%02X "
738 			"pkt_len %d ngathers %d sop_index %d\n",
739 			stuff_offset, start_offset,
740 				pkt_len, ngathers, sop_index));
741 
742 		(void) ddi_dma_sync(tx_desc_dma_handle,
743 			0,
744 			(ngathers - nsdescs) * sizeof (tx_desc_t),
745 			DDI_DMA_SYNC_FORDEV);
746 		NXGE_DEBUG_MSG((nxgep, TX_CTL, "nxge_start(21): sync 2 "
747 			"cs_off = 0x%02X cs_s_off = 0x%02X "
748 			"pkt_len %d ngathers %d sop_index %d\n",
749 			stuff_offset, start_offset,
750 			pkt_len, ngathers, sop_index));
751 	}
752 
753 	tail_index = tx_ring_p->wr_index;
754 	tail_wrap = tx_ring_p->wr_index_wrap;
755 
756 	tx_ring_p->wr_index = i;
757 	if (tx_ring_p->wr_index <= tail_index) {
758 		tx_ring_p->wr_index_wrap = ((tail_wrap == B_TRUE) ?
759 						B_FALSE : B_TRUE);
760 	}
761 
762 	tx_ring_p->descs_pending += ngathers;
763 	NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start: TX kick: "
764 		"channel %d wr_index %d wrap %d ngathers %d desc_pend %d",
765 		tx_ring_p->tdc,
766 		tx_ring_p->wr_index,
767 		tx_ring_p->wr_index_wrap,
768 		ngathers,
769 		tx_ring_p->descs_pending));
770 
771 	NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start: TX KICKING: "));
772 
773 	{
774 		tx_ring_kick_t		kick;
775 
776 		kick.value = 0;
777 		kick.bits.ldw.wrap = tx_ring_p->wr_index_wrap;
778 		kick.bits.ldw.tail = (uint16_t)tx_ring_p->wr_index;
779 
780 		/* Kick start the Transmit kick register */
781 		TXDMA_REG_WRITE64(NXGE_DEV_NPI_HANDLE(nxgep),
782 			TX_RING_KICK_REG,
783 			(uint8_t)tx_ring_p->tdc,
784 			kick.value);
785 	}
786 
787 	tdc_stats->tx_starts++;
788 
789 	MUTEX_EXIT(&tx_ring_p->lock);
790 
791 	NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_start"));
792 
793 	return (status);
794 
795 nxge_start_fail2:
796 	if (good_packet == B_FALSE) {
797 		cur_index = sop_index;
798 		NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_start: clean up"));
799 		for (i = 0; i < ngathers; i++) {
800 			tx_desc_p = &tx_desc_ring_vp[cur_index];
801 			npi_handle.regp = (uint64_t)tx_desc_p;
802 			tx_msg_p = &tx_msg_ring[cur_index];
803 			(void) npi_txdma_desc_set_zero(npi_handle, 1);
804 			if (tx_msg_p->flags.dma_type == USE_DVMA) {
805 				NXGE_DEBUG_MSG((nxgep, TX_CTL,
806 					"tx_desc_p = %X index = %d",
807 					tx_desc_p, tx_ring_p->rd_index));
808 				(void) dvma_unload(
809 						tx_msg_p->dvma_handle,
810 						0, -1);
811 				tx_msg_p->dvma_handle = NULL;
812 				if (tx_ring_p->dvma_wr_index ==
813 					tx_ring_p->dvma_wrap_mask)
814 					tx_ring_p->dvma_wr_index = 0;
815 				else
816 					tx_ring_p->dvma_wr_index++;
817 				tx_ring_p->dvma_pending--;
818 			} else if (tx_msg_p->flags.dma_type ==
819 					USE_DMA) {
820 				if (ddi_dma_unbind_handle(
821 					tx_msg_p->dma_handle))
822 					cmn_err(CE_WARN, "!nxge_start: "
823 						"ddi_dma_unbind_handle failed");
824 			}
825 			tx_msg_p->flags.dma_type = USE_NONE;
826 			cur_index = TXDMA_DESC_NEXT_INDEX(cur_index, 1,
827 				tx_ring_p->tx_wrap_mask);
828 
829 		}
830 
831 		nxgep->resched_needed = B_TRUE;
832 	}
833 
834 	MUTEX_EXIT(&tx_ring_p->lock);
835 
836 nxge_start_fail1:
837 	/* Add FMA to check the access handle nxge_hregh */
838 
839 	NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_start"));
840 
841 	return (status);
842 }
843 
844 boolean_t
845 nxge_send(p_nxge_t nxgep, mblk_t *mp, p_mac_tx_hint_t hp)
846 {
847 	p_tx_ring_t 		*tx_rings;
848 	uint8_t			ring_index;
849 
850 	NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_send"));
851 
852 	ASSERT(mp->b_next == NULL);
853 
854 	ring_index = nxge_tx_lb_ring_1(mp, nxgep->max_tdcs, hp);
855 	tx_rings = nxgep->tx_rings->rings;
856 	NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_tx_msg: tx_rings $%p",
857 		tx_rings));
858 	NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_tx_msg: max_tdcs %d "
859 		"ring_index %d", nxgep->max_tdcs, ring_index));
860 
861 	if (nxge_start(nxgep, tx_rings[ring_index], mp)) {
862 		NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_send: failed "
863 			"ring index %d", ring_index));
864 		return (B_FALSE);
865 	}
866 
867 	NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_send: ring index %d",
868 		ring_index));
869 
870 	return (B_TRUE);
871 }
872 
873 /*
874  * nxge_m_tx() - send a chain of packets
875  */
876 mblk_t *
877 nxge_m_tx(void *arg, mblk_t *mp)
878 {
879 	p_nxge_t 		nxgep = (p_nxge_t)arg;
880 	mblk_t 			*next;
881 	mac_tx_hint_t		hint;
882 
883 	if (!(nxgep->drv_state & STATE_HW_INITIALIZED)) {
884 		NXGE_DEBUG_MSG((nxgep, DDI_CTL,
885 			"==> nxge_m_tx: hardware not initialized"));
886 		NXGE_DEBUG_MSG((nxgep, DDI_CTL,
887 			"<== nxge_m_tx"));
888 		return (mp);
889 	}
890 
891 	hint.hash =  NULL;
892 	hint.vid =  0;
893 	hint.sap =  0;
894 
895 	while (mp != NULL) {
896 		next = mp->b_next;
897 		mp->b_next = NULL;
898 
899 		/*
900 		 * Until Nemo tx resource works, the mac driver
901 		 * does the load balancing based on TCP port,
902 		 * or CPU. For debugging, we use a system
903 		 * configurable parameter.
904 		 */
905 		if (!nxge_send(nxgep, mp, &hint)) {
906 			mp->b_next = next;
907 			break;
908 		}
909 
910 		mp = next;
911 	}
912 
913 	return (mp);
914 }
915 
916 int
917 nxge_tx_lb_ring_1(p_mblk_t mp, uint32_t maxtdcs, p_mac_tx_hint_t hp)
918 {
919 	uint8_t 		ring_index = 0;
920 	uint8_t 		*tcp_port;
921 	p_mblk_t 		nmp;
922 	size_t 			mblk_len;
923 	size_t 			iph_len;
924 	size_t 			hdrs_size;
925 	uint8_t			hdrs_buf[sizeof (struct  ether_header) +
926 					IP_MAX_HDR_LENGTH + sizeof (uint32_t)];
927 				/*
928 				 * allocate space big enough to cover
929 				 * the max ip header length and the first
930 				 * 4 bytes of the TCP/IP header.
931 				 */
932 
933 	boolean_t		qos = B_FALSE;
934 
935 	NXGE_DEBUG_MSG((NULL, TX_CTL, "==> nxge_tx_lb_ring"));
936 
937 	if (hp->vid) {
938 		qos = B_TRUE;
939 	}
940 	switch (nxge_tx_lb_policy) {
941 	case NXGE_TX_LB_TCPUDP: /* default IPv4 TCP/UDP */
942 	default:
943 		tcp_port = mp->b_rptr;
944 		if (!nxge_no_tx_lb && !qos &&
945 			(ntohs(((p_ether_header_t)tcp_port)->ether_type)
946 				== ETHERTYPE_IP)) {
947 			nmp = mp;
948 			mblk_len = MBLKL(nmp);
949 			tcp_port = NULL;
950 			if (mblk_len > sizeof (struct ether_header) +
951 					sizeof (uint8_t)) {
952 				tcp_port = nmp->b_rptr +
953 					sizeof (struct ether_header);
954 				mblk_len -= sizeof (struct ether_header);
955 				iph_len = ((*tcp_port) & 0x0f) << 2;
956 				if (mblk_len > (iph_len + sizeof (uint32_t))) {
957 					tcp_port = nmp->b_rptr;
958 				} else {
959 					tcp_port = NULL;
960 				}
961 			}
962 			if (tcp_port == NULL) {
963 				hdrs_size = 0;
964 				((p_ether_header_t)hdrs_buf)->ether_type = 0;
965 				while ((nmp) && (hdrs_size <
966 						sizeof (hdrs_buf))) {
967 					mblk_len = MBLKL(nmp);
968 					if (mblk_len >=
969 						(sizeof (hdrs_buf) - hdrs_size))
970 						mblk_len = sizeof (hdrs_buf) -
971 							hdrs_size;
972 					bcopy(nmp->b_rptr,
973 						&hdrs_buf[hdrs_size], mblk_len);
974 					hdrs_size += mblk_len;
975 					nmp = nmp->b_cont;
976 				}
977 				tcp_port = hdrs_buf;
978 			}
979 			tcp_port += sizeof (ether_header_t);
980 			if (!(tcp_port[6] & 0x3f) && !(tcp_port[7] & 0xff)) {
981 				if ((tcp_port[9] == IPPROTO_TCP) ||
982 						(tcp_port[9] == IPPROTO_UDP)) {
983 					tcp_port += ((*tcp_port) & 0x0f) << 2;
984 					ring_index =
985 						((tcp_port[1] ^ tcp_port[3])
986 						% maxtdcs);
987 				} else {
988 					ring_index = tcp_port[19] % maxtdcs;
989 				}
990 			} else { /* fragmented packet */
991 				ring_index = tcp_port[19] % maxtdcs;
992 			}
993 		} else {
994 			ring_index = mp->b_band % maxtdcs;
995 		}
996 		break;
997 
998 	case NXGE_TX_LB_HASH:
999 		if (hp->hash) {
1000 			ring_index = ((uint64_t)(hp->hash) % maxtdcs);
1001 		} else {
1002 			ring_index = mp->b_band % maxtdcs;
1003 		}
1004 		break;
1005 
1006 	case NXGE_TX_LB_DEST_MAC: /* Use destination MAC address */
1007 		tcp_port = mp->b_rptr;
1008 		ring_index = tcp_port[5] % maxtdcs;
1009 		break;
1010 	}
1011 
1012 	NXGE_DEBUG_MSG((NULL, TX_CTL, "<== nxge_tx_lb_ring"));
1013 
1014 	return (ring_index);
1015 }
1016 
1017 uint_t
1018 nxge_reschedule(caddr_t arg)
1019 {
1020 	p_nxge_t nxgep;
1021 
1022 	nxgep = (p_nxge_t)arg;
1023 
1024 	NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_reschedule"));
1025 
1026 	if (nxgep->nxge_mac_state == NXGE_MAC_STARTED &&
1027 			nxgep->resched_needed) {
1028 		mac_tx_update(nxgep->mach);
1029 		nxgep->resched_needed = B_FALSE;
1030 		nxgep->resched_running = B_FALSE;
1031 	}
1032 
1033 	NXGE_DEBUG_MSG((NULL, TX_CTL, "<== nxge_reschedule"));
1034 	return (DDI_INTR_CLAIMED);
1035 }
1036